deltacat 2.0.0b10__py3-none-any.whl → 2.0.0b12__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package from a supported registry. It is provided for informational purposes only and reflects the changes between these package versions as they appear in their respective public registries.
- deltacat/__init__.py +96 -17
- deltacat/api.py +122 -67
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/benchmark_engine.py +4 -2
- deltacat/benchmarking/conftest.py +0 -18
- deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
- deltacat/catalog/__init__.py +64 -5
- deltacat/catalog/delegate.py +445 -63
- deltacat/catalog/interface.py +188 -62
- deltacat/catalog/main/impl.py +2435 -279
- deltacat/catalog/model/catalog.py +154 -77
- deltacat/catalog/model/properties.py +63 -22
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
- deltacat/compute/compactor/model/round_completion_info.py +16 -6
- deltacat/compute/compactor/repartition_session.py +8 -21
- deltacat/compute/compactor/steps/hash_bucket.py +5 -5
- deltacat/compute/compactor/steps/materialize.py +9 -7
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +6 -5
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +17 -14
- deltacat/compute/compactor_v2/constants.py +30 -1
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +33 -8
- deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +267 -55
- deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +11 -4
- deltacat/compute/compactor_v2/utils/merge.py +15 -2
- deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
- deltacat/compute/compactor_v2/utils/task_options.py +45 -33
- deltacat/compute/converter/converter_session.py +145 -32
- deltacat/compute/converter/model/convert_input.py +26 -19
- deltacat/compute/converter/model/convert_input_files.py +33 -16
- deltacat/compute/converter/model/convert_result.py +35 -16
- deltacat/compute/converter/model/converter_session_params.py +24 -21
- deltacat/compute/converter/pyiceberg/catalog.py +21 -18
- deltacat/compute/converter/pyiceberg/overrides.py +18 -9
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
- deltacat/compute/converter/steps/convert.py +157 -50
- deltacat/compute/converter/steps/dedupe.py +24 -11
- deltacat/compute/converter/utils/convert_task_options.py +27 -12
- deltacat/compute/converter/utils/converter_session_utils.py +126 -60
- deltacat/compute/converter/utils/iceberg_columns.py +8 -8
- deltacat/compute/converter/utils/io.py +101 -12
- deltacat/compute/converter/utils/s3u.py +33 -27
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/client.py +25 -12
- deltacat/compute/resource_estimation/delta.py +38 -6
- deltacat/compute/resource_estimation/model.py +8 -0
- deltacat/constants.py +45 -2
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/env.py +10 -0
- deltacat/examples/basic_logging.py +1 -3
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +3 -5
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
- deltacat/examples/indexer/indexer.py +2 -2
- deltacat/examples/indexer/job_runner.py +1 -2
- deltacat/exceptions.py +66 -4
- deltacat/experimental/catalog/iceberg/__init__.py +6 -0
- deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
- deltacat/{catalog → experimental/catalog}/iceberg/impl.py +29 -11
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
- deltacat/{storage → experimental/storage}/iceberg/impl.py +6 -4
- deltacat/{storage → experimental/storage}/iceberg/model.py +7 -3
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/__init__.py +11 -0
- deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
- deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -12
- deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
- deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
- deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
- deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
- deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -3
- deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
- deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
- deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
- deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +7 -6
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
- deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
- deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
- deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
- deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
- deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
- deltacat/io/datasource/deltacat_datasource.py +0 -1
- deltacat/io/reader/deltacat_read_api.py +1 -1
- deltacat/storage/__init__.py +20 -2
- deltacat/storage/interface.py +54 -32
- deltacat/storage/main/impl.py +1494 -541
- deltacat/storage/model/delta.py +27 -3
- deltacat/storage/model/locator.py +6 -12
- deltacat/storage/model/manifest.py +182 -6
- deltacat/storage/model/metafile.py +151 -78
- deltacat/storage/model/namespace.py +8 -1
- deltacat/storage/model/partition.py +117 -42
- deltacat/storage/model/schema.py +2427 -159
- deltacat/storage/model/shard.py +6 -2
- deltacat/storage/model/sort_key.py +40 -0
- deltacat/storage/model/stream.py +9 -2
- deltacat/storage/model/table.py +12 -1
- deltacat/storage/model/table_version.py +11 -0
- deltacat/storage/model/transaction.py +1184 -208
- deltacat/storage/model/transform.py +81 -2
- deltacat/storage/model/types.py +48 -26
- deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/data/__init__.py +0 -0
- deltacat/tests/catalog/main/__init__.py +0 -0
- deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
- deltacat/tests/catalog/model/__init__.py +0 -0
- deltacat/tests/catalog/model/test_table_definition.py +16 -0
- deltacat/tests/catalog/test_catalogs.py +103 -106
- deltacat/tests/catalog/test_default_catalog_impl.py +12152 -72
- deltacat/tests/compute/compact_partition_test_cases.py +35 -8
- deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
- deltacat/tests/compute/compactor/utils/test_io.py +124 -120
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
- deltacat/tests/compute/conftest.py +8 -44
- deltacat/tests/compute/converter/test_convert_session.py +675 -490
- deltacat/tests/compute/converter/utils.py +15 -6
- deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
- deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
- deltacat/tests/compute/test_compact_partition_params.py +13 -8
- deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +716 -43
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
- deltacat/tests/daft/__init__.py +0 -0
- deltacat/tests/daft/test_model.py +97 -0
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/catalog/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/experimental/daft/__init__.py +0 -0
- deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
- deltacat/tests/experimental/storage/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
- deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -3
- deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
- deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
- deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +5 -3
- deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
- deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
- deltacat/tests/storage/main/test_main_storage.py +6900 -95
- deltacat/tests/storage/model/test_metafile_io.py +78 -173
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +171 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_shard.py +3 -1
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_transaction.py +393 -48
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +988 -4
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/pyarrow.py +52 -21
- deltacat/tests/test_utils/storage.py +23 -34
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +121 -31
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1370 -89
- deltacat/types/media.py +224 -14
- deltacat/types/tables.py +2329 -59
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +823 -36
- deltacat/utils/export.py +3 -1
- deltacat/utils/filesystem.py +100 -0
- deltacat/utils/metafile_locator.py +2 -1
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +658 -27
- deltacat/utils/pyarrow.py +1258 -213
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +57 -16
- deltacat-2.0.0b12.dist-info/METADATA +1163 -0
- deltacat-2.0.0b12.dist-info/RECORD +439 -0
- {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info}/WHEEL +1 -1
- deltacat/catalog/iceberg/__init__.py +0 -4
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/daft/daft_scan.py +0 -115
- deltacat/daft/model.py +0 -258
- deltacat/daft/translator.py +0 -126
- deltacat/examples/common/fixtures.py +0 -15
- deltacat/storage/iceberg/iceberg_scan_planner.py +0 -28
- deltacat/storage/rivulet/__init__.py +0 -11
- deltacat/storage/rivulet/feather/__init__.py +0 -5
- deltacat/storage/rivulet/parquet/__init__.py +0 -5
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-2.0.0b10.dist-info/METADATA +0 -68
- deltacat-2.0.0b10.dist-info/RECORD +0 -381
- /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
- /deltacat/{daft → docs/autogen/schema}/__init__.py +0 -0
- /deltacat/{examples/common → docs/autogen/schema/inference}/__init__.py +0 -0
- /deltacat/examples/{iceberg → compactor}/__init__.py +0 -0
- /deltacat/{storage/iceberg → examples/experimental}/__init__.py +0 -0
- /deltacat/{storage/rivulet/arrow → examples/experimental/iceberg}/__init__.py +0 -0
- /deltacat/{storage/rivulet/fs → examples/experimental/iceberg/converter}/__init__.py +0 -0
- /deltacat/{storage/rivulet/metastore → examples/experimental/iceberg/converter/beam}/__init__.py +0 -0
- /deltacat/{storage/rivulet/reader → experimental/catalog}/__init__.py +0 -0
- /deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +0 -0
- /deltacat/{storage/rivulet/schema → experimental/compatibility}/__init__.py +0 -0
- /deltacat/{storage/rivulet/writer → experimental/converter_agent}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet → experimental/converter_agent/beam}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/fs → experimental/storage}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/schema → experimental/storage/iceberg}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/writer → experimental/storage/rivulet/arrow}/__init__.py +0 -0
- /deltacat/{storage/rivulet/parquet/data_reader.py → experimental/storage/rivulet/fs/__init__.py} +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
- {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info/licenses}/LICENSE +0 -0
- {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info}/top_level.txt +0 -0
deltacat/compute/merge_on_read/model/merge_on_read_params.py
DELETED
@@ -1,66 +0,0 @@
-from __future__ import annotations
-from typing import Optional, Dict, List, Union, Any
-from deltacat.storage import (
-    Delta,
-    DeltaLocator,
-    interface as unimplemented_deltacat_storage,
-)
-
-
-class MergeOnReadParams(dict):
-    """
-    This class represents the parameters passed to compact_partition (deltacat/compute/compactor/compaction_session.py)
-    """
-
-    @staticmethod
-    def of(params: Optional[Dict]) -> MergeOnReadParams:
-        params = {} if params is None else params
-
-        result = MergeOnReadParams(params)
-        assert result.deltas is not None, "deltas is a required arg"
-
-        result.deltacat_storage = params.get(
-            "deltacat_storage", unimplemented_deltacat_storage
-        )
-        result.reader_kwargs = params.get("reader_kwargs", {})
-        result.deltacat_storage_kwargs = params.get("deltacat_storage_kwargs", {})
-
-        return result
-
-    @property
-    def deltas(self) -> List[Union[Delta, DeltaLocator]]:
-        """
-        The list of deltas to compact in-memory.
-        """
-        return self["deltas"]
-
-    @deltas.setter
-    def deltas(self, to_set: List[Union[Delta, DeltaLocator]]) -> None:
-        self["deltas"] = to_set
-
-    @property
-    def reader_kwargs(self) -> Dict[Any, Any]:
-        """
-        The key word arguments to be passed to the reader.
-        """
-        return self["reader_kwargs"]
-
-    @reader_kwargs.setter
-    def reader_kwargs(self, kwargs: Dict[Any, Any]) -> None:
-        self["reader_kwargs"] = kwargs
-
-    @property
-    def deltacat_storage(self) -> unimplemented_deltacat_storage:
-        return self["deltacat_storage"]
-
-    @deltacat_storage.setter
-    def deltacat_storage(self, storage: unimplemented_deltacat_storage) -> None:
-        self["deltacat_storage"] = storage
-
-    @property
-    def deltacat_storage_kwargs(self) -> dict:
-        return self["deltacat_storage_kwargs"]
-
-    @deltacat_storage_kwargs.setter
-    def deltacat_storage_kwargs(self, kwargs: dict) -> None:
-        self["deltacat_storage_kwargs"] = kwargs
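A minimal usage sketch of the removed MergeOnReadParams API, based only on the deleted code above; my_deltas and my_storage are hypothetical placeholders:

# Hypothetical construction of the removed parameter model (placeholder inputs).
params = MergeOnReadParams.of(
    {
        "deltas": my_deltas,  # required: List[Union[Delta, DeltaLocator]]
        "deltacat_storage": my_storage,  # defaults to the unimplemented storage interface
        "reader_kwargs": {},  # forwarded to the reader
        "deltacat_storage_kwargs": {},  # forwarded to the storage implementation
    }
)
assert params.deltas is my_deltas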
deltacat/compute/merge_on_read/utils/delta.py
DELETED
@@ -1,42 +0,0 @@
-from typing import List, Dict, Any, Optional, Union
-from deltacat.storage.model.delta import Delta, DeltaLocator
-from deltacat.storage.model.types import DistributedDataset
-from deltacat.storage import (
-    interface as unimplemented_deltacat_storage,
-)
-from deltacat.types.media import TableType, StorageType, DistributedDatasetType
-
-
-def create_df_from_all_deltas(
-    deltas: List[Union[Delta, DeltaLocator]],
-    table_type: TableType,
-    distributed_dataset_type: DistributedDatasetType,
-    reader_kwargs: Optional[Dict[Any, Any]] = None,
-    deltacat_storage=unimplemented_deltacat_storage,
-    deltacat_storage_kwargs: Optional[Dict[Any, Any]] = None,
-    *args,
-    **kwargs
-) -> List[DistributedDataset]:  # type: ignore
-    """
-    This method creates a distributed dataset for each delta and returns their references.
-    """
-
-    if reader_kwargs is None:
-        reader_kwargs = {}
-    if deltacat_storage_kwargs is None:
-        deltacat_storage_kwargs = {}
-
-    df_list = []
-
-    for delta in deltas:
-        df = deltacat_storage.download_delta(
-            delta_like=delta,
-            table_type=table_type,
-            distributed_dataset_type=distributed_dataset_type,
-            storage_type=StorageType.DISTRIBUTED,
-            **reader_kwargs,
-            **deltacat_storage_kwargs
-        )
-        df_list.append(df)
-
-    return df_list
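A hedged sketch of how the removed helper was called; my_deltas and my_storage are hypothetical placeholders, and the specific TableType / DistributedDatasetType members are assumed for illustration only:

# Hypothetical call: returns one distributed dataset reference per input delta.
dataframes = create_df_from_all_deltas(
    deltas=my_deltas,
    table_type=TableType.PYARROW,  # assumed member, for illustration
    distributed_dataset_type=DistributedDatasetType.DAFT,  # assumed member, for illustration
    deltacat_storage=my_storage,
)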
deltacat/daft/daft_scan.py
DELETED
@@ -1,115 +0,0 @@
-from typing import Iterator
-
-from daft import Schema
-from daft.daft import (
-    StorageConfig,
-    PartitionField,
-    Pushdowns,
-    ScanTask,
-    FileFormatConfig,
-    ParquetSourceConfig,
-)
-from daft.io.scan import ScanOperator, ScanPushdowns
-
-from deltacat.catalog.model.table_definition import TableDefinition
-from deltacat.daft.model import DaftPartitionKeyMapper
-from deltacat.daft.translator import translate_pushdown
-
-
-class DeltaCatScanOperator(ScanOperator):
-    def __init__(self, table: TableDefinition, storage_config: StorageConfig) -> None:
-        super().__init__()
-        self.table = table
-        self._schema = self._infer_schema()
-        self.partition_keys = self._infer_partition_keys()
-        self.storage_config = storage_config
-
-    def schema(self) -> Schema:
-        return self._schema
-
-    def name(self) -> str:
-        return "DeltaCatScanOperator"
-
-    def display_name(self) -> str:
-        return f"DeltaCATScanOperator({self.table.table.namespace}.{self.table.table.table_name})"
-
-    def partitioning_keys(self) -> list[PartitionField]:
-        return self.partition_keys
-
-    def multiline_display(self) -> list[str]:
-        return [
-            self.display_name(),
-            f"Schema = {self._schema}",
-            f"Partitioning keys = {self.partitioning_keys}",
-            f"Storage config = {self.storage_config}",
-        ]
-
-    def to_scan_tasks(self, pushdowns: Pushdowns) -> Iterator[ScanTask]:
-        daft_pushdowns = ScanPushdowns._from_pypushdowns(
-            pushdowns, schema=self.schema()
-        )
-        dc_pushdown = translate_pushdown(daft_pushdowns)
-        dc_scan_plan = self.table.create_scan_plan(pushdown=dc_pushdown)
-        scan_tasks = []
-        file_format_config = FileFormatConfig.from_parquet_config(
-            # maybe this: ParquetSourceConfig(field_id_mapping=self._field_id_mapping)
-            ParquetSourceConfig()
-        )
-        for dc_scan_task in dc_scan_plan.scan_tasks:
-            for data_file in dc_scan_task.data_files():
-                st = ScanTask.catalog_scan_task(
-                    file=data_file.file_path,
-                    file_format=file_format_config,
-                    schema=self._schema._schema,
-                    storage_config=self.storage_config,
-                    pushdowns=pushdowns,
-                )
-                scan_tasks.append(st)
-        return iter(scan_tasks)
-
-    def can_absorb_filter(self) -> bool:
-        return False
-
-    def can_absorb_limit(self) -> bool:
-        return False
-
-    def can_absorb_select(self) -> bool:
-        return True
-
-    def _infer_schema(self) -> Schema:
-
-        if not (
-            self.table and self.table.table_version and self.table.table_version.schema
-        ):
-            raise RuntimeError(
-                f"Failed to infer schema for DeltaCAT Table "
-                f"{self.table.table.namespace}.{self.table.table.table_name}"
-            )
-
-        return Schema.from_pyarrow_schema(self.table.table_version.schema.arrow)
-
-    def _infer_partition_keys(self) -> list[PartitionField]:
-        if not (
-            self.table
-            and self.table.table_version
-            and self.table.table_version.partition_scheme
-            and self.table.table_version.schema
-        ):
-            raise RuntimeError(
-                f"Failed to infer partition keys for DeltaCAT Table "
-                f"{self.table.table.namespace}.{self.table.table.table_name}"
-            )
-
-        schema = self.table.table_version.schema
-        partition_keys = self.table.table_version.partition_scheme.keys
-        if not partition_keys:
-            return []
-
-        partition_fields = []
-        for key in partition_keys:
-            field = DaftPartitionKeyMapper.unmap(key, schema)
-            # Assert that the returned value is not None.
-            assert field is not None, f"Unmapping failed for key {key}"
-            partition_fields.append(field)
-
-        return partition_fields
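A hedged sketch of how the removed scan operator was instantiated, using only the constructor and methods shown above; table_definition and storage_config are hypothetical placeholders for a DeltaCAT TableDefinition and a daft.daft.StorageConfig:

# Hypothetical wiring of the removed operator (placeholders, not a verified API walkthrough).
scan_operator = DeltaCatScanOperator(table=table_definition, storage_config=storage_config)
print(scan_operator.display_name())        # e.g. "DeltaCATScanOperator(<namespace>.<table_name>)"
print(scan_operator.multiline_display())   # schema, partitioning keys, and storage config summary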
deltacat/daft/model.py
DELETED
@@ -1,258 +0,0 @@
-from typing import Optional
-
-import pyarrow as pa
-from pyarrow import Field as PaField
-from daft import Schema as DaftSchema, DataType
-from daft.daft import (
-    PartitionField as DaftPartitionField,
-    PartitionTransform as DaftTransform,
-)
-from daft.logical.schema import Field as DaftField
-from daft.io.scan import make_partition_field
-
-from deltacat.storage.model.schema import Schema
-from deltacat.storage.model.interop import ModelMapper
-from deltacat.storage.model.partition import PartitionKey
-from deltacat.storage.model.transform import (
-    BucketingStrategy,
-    Transform,
-    BucketTransform,
-    HourTransform,
-    DayTransform,
-    MonthTransform,
-    YearTransform,
-    IdentityTransform,
-    TruncateTransform,
-)
-
-
-class DaftFieldMapper(ModelMapper[DaftField, PaField]):
-    @staticmethod
-    def map(
-        obj: Optional[DaftField],
-        **kwargs,
-    ) -> Optional[PaField]:
-        """Convert Daft Field to PyArrow Field.
-
-        Args:
-            obj: The Daft Field to convert
-            **kwargs: Additional arguments
-
-        Returns:
-            Converted PyArrow Field object
-        """
-        if obj is None:
-            return None
-
-        return pa.field(
-            name=obj.name,
-            type=obj.dtype.to_arrow_dtype(),
-        )
-
-    @staticmethod
-    def unmap(
-        obj: Optional[PaField],
-        **kwargs,
-    ) -> Optional[DaftField]:
-        """Convert PyArrow Field to Daft Field.
-
-        Args:
-            obj: The PyArrow Field to convert
-            **kwargs: Additional arguments
-
-        Returns:
-            Converted Daft Field object
-        """
-        if obj is None:
-            return None
-
-        return DaftField.create(
-            name=obj.name,
-            dtype=DataType.from_arrow_type(obj.type),  # type: ignore
-        )
-
-
-class DaftTransformMapper(ModelMapper[DaftTransform, Transform]):
-    @staticmethod
-    def map(
-        obj: Optional[DaftTransform],
-        **kwargs,
-    ) -> Optional[Transform]:
-        """Convert DaftTransform to DeltaCAT Transform.
-
-        Args:
-            obj: The DaftTransform to convert
-            **kwargs: Additional arguments
-
-        Returns:
-            Converted Transform object
-        """
-
-        # daft.PartitionTransform doesn't have a Python interface for accessing its attributes,
-        # thus conversion is not possible.
-        # TODO: request Daft to expose Python friendly interface for daft.PartitionTransform
-        raise NotImplementedError(
-            "Converting transform from Daft to DeltaCAT is not supported"
-        )
-
-    @staticmethod
-    def unmap(
-        obj: Optional[Transform],
-        **kwargs,
-    ) -> Optional[DaftTransform]:
-        """Convert DeltaCAT Transform to DaftTransform.
-
-        Args:
-            obj: The Transform to convert
-            **kwargs: Additional arguments
-
-        Returns:
-            Converted DaftTransform object
-        """
-        if obj is None:
-            return None
-
-        # Map DeltaCAT transforms to Daft transforms using isinstance
-
-        if isinstance(obj, IdentityTransform):
-            return DaftTransform.identity()
-        elif isinstance(obj, HourTransform):
-            return DaftTransform.hour()
-        elif isinstance(obj, DayTransform):
-            return DaftTransform.day()
-        elif isinstance(obj, MonthTransform):
-            return DaftTransform.month()
-        elif isinstance(obj, YearTransform):
-            return DaftTransform.year()
-        elif isinstance(obj, BucketTransform):
-            if obj.parameters.bucketing_strategy == BucketingStrategy.ICEBERG:
-                return DaftTransform.iceberg_bucket(obj.parameters.num_buckets)
-            else:
-                raise ValueError(
-                    f"Unsupported Bucketing Strategy: {obj.parameters.bucketing_strategy}"
-                )
-        elif isinstance(obj, TruncateTransform):
-            return DaftTransform.iceberg_truncate(obj.parameters.width)
-
-        raise ValueError(f"Unsupported Transform: {obj}")
-
-
-class DaftPartitionKeyMapper(ModelMapper[DaftPartitionField, PartitionKey]):
-    @staticmethod
-    def map(
-        obj: Optional[DaftPartitionField],
-        schema: Optional[DaftSchema] = None,
-        **kwargs,
-    ) -> Optional[PartitionKey]:
-        """Convert DaftPartitionField to PartitionKey.
-
-        Args:
-            obj: The DaftPartitionField to convert
-            schema: The Daft schema containing field information
-            **kwargs: Additional arguments
-
-        Returns:
-            Converted PartitionKey object
-        """
-        # Daft PartitionField only exposes 1 attribute `field` which is not enough
-        # to convert to DeltaCAT PartitionKey
-        # TODO: request Daft to expose more Python friendly interface for PartitionField
-        raise NotImplementedError(
-            f"Converting Daft PartitionField to DeltaCAT PartitionKey is not supported"
-        )
-
-    @staticmethod
-    def unmap(
-        obj: Optional[PartitionKey],
-        schema: Optional[Schema] = None,
-        **kwargs,
-    ) -> Optional[DaftPartitionField]:
-        """Convert PartitionKey to DaftPartitionField.
-
-        Args:
-            obj: The DeltaCAT PartitionKey to convert
-            schema: The Schema containing field information
-            **kwargs: Additional arguments
-
-        Returns:
-            Converted DaftPartitionField object
-        """
-        if obj is None:
-            return None
-        if obj.name is None:
-            raise ValueError("Name is required for PartitionKey conversion")
-        if not schema:
-            raise ValueError("Schema is required for PartitionKey conversion")
-        if len(obj.key) < 1:
-            raise ValueError(
-                f"At least 1 PartitionKey FieldLocator is expected, instead got {len(obj.key)}. FieldLocators: {obj.key}."
-            )
-
-        # Get the source field from schema - FieldLocator in PartitionKey.key points to the source field of partition field
-        dc_source_field = schema.field(obj.key[0]).arrow
-        daft_source_field = DaftFieldMapper.unmap(obj=dc_source_field)
-        # Convert transform if present
-        daft_transform = DaftTransformMapper.unmap(obj.transform)
-        daft_partition_field = DaftPartitionKeyMapper.get_daft_partition_field(
-            partition_field_name=obj.name,
-            daft_source_field=daft_source_field,
-            dc_transform=obj.transform,
-        )
-
-        # Create DaftPartitionField
-        return make_partition_field(
-            field=daft_partition_field,
-            source_field=daft_source_field,
-            transform=daft_transform,
-        )
-
-    @staticmethod
-    def get_daft_partition_field(
-        partition_field_name: str,
-        daft_source_field: Optional[DaftField],
-        # TODO: replace DeltaCAT transform with Daft Transform for uniformality
-        # We cannot use Daft Transform here because Daft Transform doesn't have a Python interface for us to
-        # access its attributes.
-        # TODO: request Daft to provide a more python friendly interface for Daft Tranform
-        dc_transform: Optional[Transform],
-    ) -> DaftField:
-        """Generate Daft Partition Field given partition field name, source field and transform.
-        Partition field type is inferred using source field type and transform.
-
-        Args:
-            partition_field_name (str): the specified result field name
-            daft_source_field (DaftField): the source field of the partition field
-            daft_transform (DaftTransform): transform applied on the source field to create partition field
-
-        Returns:
-            DaftField: Daft Field representing the partition field
-        """
-        if daft_source_field is None:
-            raise ValueError("Source field is required for PartitionField conversion")
-        if dc_transform is None:
-            raise ValueError("Transform is required for PartitionField conversion")
-
-        result_type = None
-        # Below type conversion logic references Daft - Iceberg conversion logic:
-        # https://github.com/Eventual-Inc/Daft/blob/7f2e9b5fb50fdfe858be17572f132b37dd6e5ab2/daft/iceberg/iceberg_scan.py#L61-L85
-        if isinstance(dc_transform, IdentityTransform):
-            result_type = daft_source_field.dtype
-        elif isinstance(dc_transform, YearTransform):
-            result_type = DataType.int32()
-        elif isinstance(dc_transform, MonthTransform):
-            result_type = DataType.int32()
-        elif isinstance(dc_transform, DayTransform):
-            result_type = DataType.int32()
-        elif isinstance(dc_transform, HourTransform):
-            result_type = DataType.int32()
-        elif isinstance(dc_transform, BucketTransform):
-            result_type = DataType.int32()
-        elif isinstance(dc_transform, TruncateTransform):
-            result_type = daft_source_field.dtype
-        else:
-            raise ValueError(f"Unsupported transform: {dc_transform}")
-
-        return DaftField.create(
-            name=partition_field_name,
-            dtype=result_type,
-        )
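A hedged sketch of the removed mapper APIs, using only the signatures shown above; dc_partition_key and dc_schema are hypothetical placeholders for a DeltaCAT PartitionKey and Schema:

# Hypothetical use of the removed mappers (placeholders, not a verified walkthrough).
daft_field = DaftFieldMapper.unmap(obj=pa.field("ts", pa.timestamp("us")))
daft_partition_field = DaftPartitionKeyMapper.unmap(
    obj=dc_partition_key,  # PartitionKey with a name, key (FieldLocators), and transform
    schema=dc_schema,      # DeltaCAT Schema containing the source field
)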
deltacat/daft/translator.py
DELETED
@@ -1,126 +0,0 @@
-from daft.io.scan import ScanPushdowns
-import pyarrow as pa
-from typing import Callable, Dict
-from daft.io.pushdowns import (
-    Expr as DaftExpr,
-    Literal as DaftLiteral,
-    Reference as DaftReference,
-    TermVisitor,
-)
-
-from deltacat.storage.model.expression import (
-    Expression,
-    Reference,
-    Literal,
-    Equal,
-    NotEqual,
-    GreaterThan,
-    LessThan,
-    GreaterThanEqual,
-    LessThanEqual,
-    And,
-    Or,
-    Not,
-    IsNull,
-)
-from deltacat.storage.model.scan.push_down import PartitionFilter, Pushdown
-
-
-def translate_pushdown(pushdown: ScanPushdowns) -> Pushdown:
-    """
-    Helper method to translate a Daft ScanPushdowns object into a Deltacat Pushdown.
-
-    Args:
-        pushdown: Daft ScanPushdowns object
-
-    Returns:
-        Pushdown: Deltacat Pushdown object with translated filters
-    """
-    translator = DaftToDeltacatExpressionTranslator()
-    partition_filter = None
-
-    if pushdown.predicate:
-        predicate = translator.visit(pushdown.predicate, None)
-        partition_filter = PartitionFilter.of(predicate)
-
-    # TODO: translate other pushdown filters
-    return Pushdown.of(
-        row_filter=None,
-        column_filter=None,
-        partition_filter=partition_filter,
-        limit=None,
-    )
-
-
-class DaftToDeltacatExpressionTranslator(TermVisitor[None, Expression]):
-    """
-    This visitor implementation traverses a Daft expression tree and produces
-    an equivalent Deltacat expression tree for use in Deltacat's query pushdown
-    system.
-    """
-
-    _PROCEDURES: Dict[str, Callable[..., Expression]] = {
-        # Comparison predicates
-        "=": Equal.of,
-        "!=": NotEqual.of,
-        "<": LessThan.of,
-        ">": GreaterThan.of,
-        "<=": LessThanEqual.of,
-        ">=": GreaterThanEqual.of,
-        # Logical predicates
-        "and": And.of,
-        "or": Or.of,
-        "not": Not.of,
-        # Special operations
-        "is_null": IsNull.of,
-    }
-
-    def visit_reference(self, term: DaftReference, context: None) -> Expression:
-        """
-        Convert Daft Reference to Deltacat Reference.
-
-        Args:
-            term: A Daft Reference expression representing a field or column.
-            context: Not used in this visitor implementation.
-
-        Returns:
-            DeltacatExpression: A Deltacat Reference expression for the same field.
-        """
-        return Reference(term.path)
-
-    def visit_literal(self, term: DaftLiteral, context: None) -> Expression:
-        """
-        Convert Daft Literal to Deltacat Literal.
-
-        Args:
-            term: A Daft Literal expression representing a constant value.
-            context: Not used in this visitor implementation.
-
-        Returns:
-            DeltacatExpression: A Deltacat Literal expression wrapping the same value as a PyArrow scalar.
-        """
-        return Literal(pa.scalar(term.value))
-
-    def visit_expr(self, term: DaftExpr, context: None) -> Expression:
-        """
-        This method handles the translation of procedure calls (operations) from
-        Daft to Deltacat, including special cases for IN, BETWEEN, and LIKE.
-
-        Args:
-            term: A Daft Expr expression representing an operation.
-            context: Not used in this visitor implementation.
-
-        Returns:
-            DeltacatExpression: An equivalent Deltacat expression.
-
-        Raises:
-            ValueError: If the operation has an invalid number of arguments or
-            if the operation is not supported by Deltacat.
-        """
-        proc = term.proc
-        args = [self.visit(arg.term, context) for arg in term.args]
-
-        if proc not in self._PROCEDURES:
-            raise ValueError(f"Deltacat does not support procedure '{proc}'.")
-
-        return self._PROCEDURES[proc](*args)
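A hedged sketch of the removed translator, mirroring how DeltaCatScanOperator.to_scan_tasks used it above; daft_pushdowns and table_definition are hypothetical placeholders:

# Hypothetical translation of a Daft ScanPushdowns object into a DeltaCAT Pushdown.
dc_pushdown = translate_pushdown(daft_pushdowns)
dc_scan_plan = table_definition.create_scan_plan(pushdown=dc_pushdown)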
deltacat/examples/common/fixtures.py
DELETED
@@ -1,15 +0,0 @@
-import os
-import logging
-import argparse
-from deltacat import logs
-
-logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
-
-
-def store_cli_args_in_os_environ(script_args_list=[]):
-    parser = argparse.ArgumentParser()
-    for args, kwargs in script_args_list:
-        parser.add_argument(*args, **kwargs)
-    args = parser.parse_args()
-    print(f"Command Line Arguments: {args}")
-    os.environ.update(vars(args))
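A hedged sketch of how the removed example fixture was invoked; the flag name below is a hypothetical example:

# Hypothetical call: parsed CLI values are copied into os.environ.
store_cli_args_in_os_environ(
    [(["--catalog-root"], {"help": "Catalog root path", "default": "/tmp/deltacat"})]
)
# The parsed value would then be available as os.environ["catalog_root"].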
deltacat/storage/iceberg/iceberg_scan_planner.py
DELETED
@@ -1,28 +0,0 @@
-from typing import Optional
-
-from pyiceberg.catalog import Catalog
-from deltacat.storage.model.scan.push_down import Pushdown
-from deltacat.storage.model.scan.scan_plan import ScanPlan
-from deltacat.storage.model.scan.scan_task import FileScanTask, DataFile
-from deltacat.storage.util.scan_planner import ScanPlanner
-from deltacat.storage.iceberg.impl import _try_load_iceberg_table
-
-
-class IcebergScanPlanner(ScanPlanner):
-    def __init__(self, catalog: Catalog):
-        self.catalog = catalog
-
-    def create_scan_plan(
-        self,
-        table_name: str,
-        namespace: Optional[str] = None,
-        pushdown: Optional[Pushdown] = None,
-    ) -> ScanPlan:
-        iceberg_table = _try_load_iceberg_table(
-            self.catalog, namespace=namespace, table_name=table_name
-        )
-        file_scan_tasks = []
-        # TODO: implement predicate pushdown to Iceberg
-        for scan_task in iceberg_table.scan().plan_files():
-            file_scan_tasks.append(FileScanTask([DataFile(scan_task.file.file_path)]))
-        return ScanPlan(file_scan_tasks)
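A hedged sketch of the removed planner's entry point; pyiceberg_catalog, the namespace, and the table name are hypothetical placeholders (a successor module appears under deltacat/experimental/storage/iceberg/ in the file list above):

# Hypothetical use of the removed Iceberg scan planner (placeholder inputs).
planner = IcebergScanPlanner(pyiceberg_catalog)
scan_plan = planner.create_scan_plan(table_name="events", namespace="analytics")
for file_scan_task in scan_plan.scan_tasks:
    ...  # each task wraps the data files for one Iceberg scan task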
deltacat/storage/rivulet/__init__.py
DELETED
@@ -1,11 +0,0 @@
-from deltacat.storage.rivulet.schema.schema import Schema
-from deltacat.storage.rivulet.schema.schema import Field
-from deltacat.storage.rivulet.dataset import Dataset
-from deltacat.storage.rivulet.schema.schema import Datatype
-
-__all__ = [
-    "Schema",
-    "Field",
-    "Dataset",
-    "Datatype",
-]
deltacat/storage/rivulet/feather/__init__.py
DELETED
@@ -1,5 +0,0 @@
-# TODO later on this will be moved to a dedicated package
-from deltacat.storage.rivulet.feather.file_reader import FeatherFileReader
-from deltacat.storage.rivulet.reader.reader_type_registrar import FileReaderRegistrar
-
-FileReaderRegistrar.register_reader("feather", FeatherFileReader)
deltacat/storage/rivulet/parquet/__init__.py
DELETED
@@ -1,5 +0,0 @@
-# TODO later on this will be moved to a dedicated package
-from deltacat.storage.rivulet.parquet.file_reader import ParquetFileReader
-from deltacat.storage.rivulet.reader.reader_type_registrar import FileReaderRegistrar
-
-FileReaderRegistrar.register_reader("parquet", ParquetFileReader)