deltacat 2.0__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +117 -18
- deltacat/api.py +536 -126
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/benchmark_engine.py +4 -2
- deltacat/benchmarking/conftest.py +1 -19
- deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
- deltacat/catalog/__init__.py +64 -5
- deltacat/catalog/delegate.py +445 -63
- deltacat/catalog/interface.py +188 -62
- deltacat/catalog/main/impl.py +2444 -282
- deltacat/catalog/model/catalog.py +208 -113
- deltacat/catalog/model/properties.py +63 -24
- deltacat/compute/__init__.py +14 -0
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
- deltacat/compute/compactor/model/round_completion_info.py +16 -6
- deltacat/compute/compactor/repartition_session.py +8 -21
- deltacat/compute/compactor/steps/hash_bucket.py +5 -5
- deltacat/compute/compactor/steps/materialize.py +9 -7
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +6 -5
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +17 -14
- deltacat/compute/compactor_v2/constants.py +30 -1
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +33 -8
- deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +267 -55
- deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +11 -4
- deltacat/compute/compactor_v2/utils/merge.py +15 -2
- deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
- deltacat/compute/compactor_v2/utils/task_options.py +45 -33
- deltacat/compute/converter/constants.py +5 -0
- deltacat/compute/converter/converter_session.py +207 -52
- deltacat/compute/converter/model/convert_input.py +43 -16
- deltacat/compute/converter/model/convert_input_files.py +33 -16
- deltacat/compute/converter/model/convert_result.py +80 -0
- deltacat/compute/converter/model/converter_session_params.py +64 -19
- deltacat/compute/converter/pyiceberg/catalog.py +21 -18
- deltacat/compute/converter/pyiceberg/overrides.py +193 -65
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
- deltacat/compute/converter/steps/convert.py +230 -75
- deltacat/compute/converter/steps/dedupe.py +46 -12
- deltacat/compute/converter/utils/convert_task_options.py +66 -22
- deltacat/compute/converter/utils/converter_session_utils.py +126 -60
- deltacat/compute/converter/utils/iceberg_columns.py +13 -8
- deltacat/compute/converter/utils/io.py +173 -13
- deltacat/compute/converter/utils/s3u.py +42 -27
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/client.py +417 -0
- deltacat/compute/resource_estimation/delta.py +38 -6
- deltacat/compute/resource_estimation/model.py +8 -0
- deltacat/constants.py +49 -6
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/env.py +10 -0
- deltacat/examples/basic_logging.py +6 -6
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +66 -21
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
- deltacat/examples/hello_world.py +4 -2
- deltacat/examples/indexer/indexer.py +163 -0
- deltacat/examples/indexer/job_runner.py +198 -0
- deltacat/exceptions.py +66 -4
- deltacat/experimental/catalog/iceberg/__init__.py +6 -0
- deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
- deltacat/{catalog → experimental/catalog}/iceberg/impl.py +43 -12
- deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +12 -14
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/daft/__init__.py +4 -0
- deltacat/experimental/daft/daft_catalog.py +229 -0
- deltacat/experimental/storage/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
- deltacat/{storage → experimental/storage}/iceberg/impl.py +6 -4
- deltacat/{storage → experimental/storage}/iceberg/model.py +7 -3
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/__init__.py +11 -0
- deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
- deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -12
- deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
- deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
- deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
- deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
- deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -3
- deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
- deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
- deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
- deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +7 -6
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
- deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
- deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
- deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
- deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
- deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
- deltacat/io/__init__.py +13 -0
- deltacat/io/dataset/__init__.py +0 -0
- deltacat/io/dataset/deltacat_dataset.py +91 -0
- deltacat/io/datasink/__init__.py +0 -0
- deltacat/io/datasink/deltacat_datasink.py +207 -0
- deltacat/io/datasource/__init__.py +0 -0
- deltacat/io/datasource/deltacat_datasource.py +579 -0
- deltacat/io/reader/__init__.py +0 -0
- deltacat/io/reader/deltacat_read_api.py +172 -0
- deltacat/storage/__init__.py +22 -2
- deltacat/storage/interface.py +54 -32
- deltacat/storage/main/impl.py +1494 -541
- deltacat/storage/model/delta.py +27 -3
- deltacat/storage/model/expression/__init__.py +47 -0
- deltacat/storage/model/expression/expression.py +656 -0
- deltacat/storage/model/expression/visitor.py +248 -0
- deltacat/storage/model/locator.py +6 -12
- deltacat/storage/model/manifest.py +231 -6
- deltacat/storage/model/metafile.py +224 -119
- deltacat/storage/model/namespace.py +8 -1
- deltacat/storage/model/partition.py +117 -42
- deltacat/storage/model/scan/push_down.py +32 -5
- deltacat/storage/model/schema.py +2427 -159
- deltacat/storage/model/shard.py +6 -2
- deltacat/storage/model/sort_key.py +40 -0
- deltacat/storage/model/stream.py +9 -2
- deltacat/storage/model/table.py +12 -1
- deltacat/storage/model/table_version.py +11 -0
- deltacat/storage/model/transaction.py +1184 -208
- deltacat/storage/model/transform.py +81 -2
- deltacat/storage/model/types.py +53 -29
- deltacat/storage/util/__init__.py +0 -0
- deltacat/storage/util/scan_planner.py +26 -0
- deltacat/tests/_io/reader/__init__.py +0 -0
- deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
- deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/data/__init__.py +0 -0
- deltacat/tests/catalog/main/__init__.py +0 -0
- deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
- deltacat/tests/catalog/model/__init__.py +0 -0
- deltacat/tests/catalog/model/test_table_definition.py +16 -0
- deltacat/tests/catalog/test_catalogs.py +103 -106
- deltacat/tests/catalog/test_default_catalog_impl.py +12152 -72
- deltacat/tests/compute/compact_partition_test_cases.py +35 -8
- deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
- deltacat/tests/compute/compactor/utils/test_io.py +124 -120
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
- deltacat/tests/compute/conftest.py +8 -44
- deltacat/tests/compute/converter/test_convert_session.py +697 -349
- deltacat/tests/compute/converter/utils.py +15 -6
- deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
- deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
- deltacat/tests/compute/test_compact_partition_params.py +13 -8
- deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +716 -43
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
- deltacat/tests/daft/__init__.py +0 -0
- deltacat/tests/daft/test_model.py +97 -0
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/catalog/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/experimental/daft/__init__.py +0 -0
- deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
- deltacat/tests/experimental/storage/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
- deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -2
- deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
- deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
- deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +6 -4
- deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
- deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
- deltacat/tests/storage/main/test_main_storage.py +6900 -95
- deltacat/tests/storage/model/test_expression.py +327 -0
- deltacat/tests/storage/model/test_manifest.py +129 -0
- deltacat/tests/storage/model/test_metafile_io.py +78 -173
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +171 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_shard.py +3 -1
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_transaction.py +393 -48
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +1036 -11
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/pyarrow.py +52 -21
- deltacat/tests/test_utils/storage.py +23 -34
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +121 -31
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1370 -89
- deltacat/types/media.py +345 -37
- deltacat/types/tables.py +2344 -46
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +824 -40
- deltacat/utils/export.py +3 -1
- deltacat/utils/filesystem.py +139 -9
- deltacat/utils/metafile_locator.py +2 -1
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +759 -0
- deltacat/utils/pyarrow.py +1373 -192
- deltacat/utils/ray_utils/concurrency.py +1 -1
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/ray_utils/runtime.py +56 -4
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +1325 -0
- deltacat-2.0.0.dist-info/METADATA +1163 -0
- deltacat-2.0.0.dist-info/RECORD +439 -0
- {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
- deltacat/catalog/iceberg/__init__.py +0 -4
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/examples/common/fixtures.py +0 -15
- deltacat/storage/iceberg/iceberg_scan_planner.py +0 -28
- deltacat/storage/rivulet/__init__.py +0 -11
- deltacat/storage/rivulet/feather/__init__.py +0 -5
- deltacat/storage/rivulet/parquet/__init__.py +0 -5
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1235
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-2.0.dist-info/METADATA +0 -65
- deltacat-2.0.dist-info/RECORD +0 -347
- /deltacat/compute/{merge_on_read/model → jobs}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/utils → docs}/__init__.py +0 -0
- /deltacat/{examples/common → docs/autogen}/__init__.py +0 -0
- /deltacat/{examples/iceberg → docs/autogen/schema}/__init__.py +0 -0
- /deltacat/{storage/iceberg → docs/autogen/schema/inference}/__init__.py +0 -0
- /deltacat/{storage/rivulet/arrow → examples/compactor}/__init__.py +0 -0
- /deltacat/{storage/rivulet/fs → examples/experimental}/__init__.py +0 -0
- /deltacat/{storage/rivulet/metastore → examples/experimental/iceberg}/__init__.py +0 -0
- /deltacat/{storage/rivulet/reader → examples/experimental/iceberg/converter}/__init__.py +0 -0
- /deltacat/{storage/rivulet/schema → examples/experimental/iceberg/converter/beam}/__init__.py +0 -0
- /deltacat/{storage/rivulet/writer → examples/indexer}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet → examples/indexer/aws}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/fs → examples/indexer/gcp}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/schema → experimental}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/writer → experimental/catalog}/__init__.py +0 -0
- /deltacat/{storage/rivulet/parquet/data_reader.py → experimental/compatibility/__init__.py} +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
- {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
- {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
@@ -1,66 +0,0 @@
|
|
1
|
-
from __future__ import annotations
|
2
|
-
from typing import Optional, Dict, List, Union, Any
|
3
|
-
from deltacat.storage import (
|
4
|
-
Delta,
|
5
|
-
DeltaLocator,
|
6
|
-
interface as unimplemented_deltacat_storage,
|
7
|
-
)
|
8
|
-
|
9
|
-
|
10
|
-
class MergeOnReadParams(dict):
    """
    Dict-backed container for the parameters of a merge-on-read operation.

    Every property below reads from / writes to the dictionary entry of the
    same name, so an instance can still be used as a plain dict.

    NOTE(review): the previous docstring described these as the parameters
    passed to compact_partition
    (deltacat/compute/compactor/compaction_session.py). That looks copied
    from another params class - confirm the intended consumer.
    """

    @staticmethod
    def of(params: Optional[Dict]) -> MergeOnReadParams:
        """
        Build a MergeOnReadParams from a plain dictionary.

        Args:
            params: Source dictionary. None is treated as an empty
                dictionary. Must contain a non-None "deltas" entry.

        Returns:
            A new MergeOnReadParams holding a copy of the given entries, with
            "deltacat_storage", "reader_kwargs" and "deltacat_storage_kwargs"
            filled in with defaults when they were not supplied.

        Raises:
            AssertionError: if "deltas" is missing or None.
        """
        params = {} if params is None else params

        result = MergeOnReadParams(params)
        # Use .get() so a missing key reports the same explicit message as a
        # None value instead of surfacing as a bare KeyError. Raised
        # explicitly (rather than via `assert`) so the check also runs under
        # `python -O`; AssertionError is kept for backward compatibility.
        if result.get("deltas") is None:
            raise AssertionError("deltas is a required arg")

        result.deltacat_storage = params.get(
            "deltacat_storage", unimplemented_deltacat_storage
        )
        result.reader_kwargs = params.get("reader_kwargs", {})
        result.deltacat_storage_kwargs = params.get("deltacat_storage_kwargs", {})

        return result

    @property
    def deltas(self) -> List[Union[Delta, DeltaLocator]]:
        """
        The list of deltas to compact in-memory.
        """
        return self["deltas"]

    @deltas.setter
    def deltas(self, to_set: List[Union[Delta, DeltaLocator]]) -> None:
        self["deltas"] = to_set

    @property
    def reader_kwargs(self) -> Dict[Any, Any]:
        """
        The key word arguments to be passed to the reader.
        """
        return self["reader_kwargs"]

    @reader_kwargs.setter
    def reader_kwargs(self, kwargs: Dict[Any, Any]) -> None:
        self["reader_kwargs"] = kwargs

    @property
    def deltacat_storage(self) -> unimplemented_deltacat_storage:
        """
        The storage implementation stored under "deltacat_storage".
        """
        return self["deltacat_storage"]

    @deltacat_storage.setter
    def deltacat_storage(self, storage: unimplemented_deltacat_storage) -> None:
        self["deltacat_storage"] = storage

    @property
    def deltacat_storage_kwargs(self) -> dict:
        """
        The key word arguments stored under "deltacat_storage_kwargs".
        """
        return self["deltacat_storage_kwargs"]

    @deltacat_storage_kwargs.setter
    def deltacat_storage_kwargs(self, kwargs: dict) -> None:
        self["deltacat_storage_kwargs"] = kwargs
|
@@ -1,42 +0,0 @@
|
|
1
|
-
from typing import List, Dict, Any, Optional, Union
|
2
|
-
from deltacat.storage.model.delta import Delta, DeltaLocator
|
3
|
-
from deltacat.storage.model.types import DistributedDataset
|
4
|
-
from deltacat.storage import (
|
5
|
-
interface as unimplemented_deltacat_storage,
|
6
|
-
)
|
7
|
-
from deltacat.types.media import TableType, StorageType, DistributedDatasetType
|
8
|
-
|
9
|
-
|
10
|
-
def create_df_from_all_deltas(
    deltas: List[Union[Delta, DeltaLocator]],
    table_type: TableType,
    distributed_dataset_type: DistributedDatasetType,
    reader_kwargs: Optional[Dict[Any, Any]] = None,
    deltacat_storage=unimplemented_deltacat_storage,
    deltacat_storage_kwargs: Optional[Dict[Any, Any]] = None,
    *args,
    **kwargs
) -> List[DistributedDataset]:  # type: ignore
    """
    Download every given delta as a distributed dataset.

    Each delta is passed to deltacat_storage.download_delta together with
    table_type, distributed_dataset_type and StorageType.DISTRIBUTED; the
    entries of reader_kwargs and deltacat_storage_kwargs are forwarded as
    extra keyword arguments (None is treated as no extra arguments).

    Extra positional and keyword arguments (*args / **kwargs) are accepted
    but not used.

    Returns:
        The download results, one per delta, in the order of deltas.
    """
    reader_options = {} if reader_kwargs is None else reader_kwargs
    storage_options = (
        {} if deltacat_storage_kwargs is None else deltacat_storage_kwargs
    )

    return [
        deltacat_storage.download_delta(
            delta_like=current_delta,
            table_type=table_type,
            distributed_dataset_type=distributed_dataset_type,
            storage_type=StorageType.DISTRIBUTED,
            **reader_options,
            **storage_options
        )
        for current_delta in deltas
    ]
|
@@ -1,15 +0,0 @@
|
|
1
|
-
import os
|
2
|
-
import logging
|
3
|
-
import argparse
|
4
|
-
from deltacat import logs
|
5
|
-
|
6
|
-
# Module-level logger for this module, set up through deltacat's
# logs.configure_deltacat_logger helper.
logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
|
7
|
-
|
8
|
-
|
9
|
-
def store_cli_args_in_os_environ(script_args_list=None):
    """
    Parse command line arguments and export them as environment variables.

    Args:
        script_args_list: Iterable of (args, kwargs) pairs; each pair is
            forwarded to ArgumentParser.add_argument(*args, **kwargs).
            None (the default) or an empty iterable declares no arguments.

    Side effects:
        Prints the parsed arguments and stores each parsed argument in
        os.environ under its argparse destination name. Values are converted
        with str() because os.environ only accepts strings; arguments whose
        parsed value is None (e.g. optional flags that were not supplied)
        are skipped instead of raising TypeError.
    """
    parser = argparse.ArgumentParser()
    for flag_args, flag_kwargs in script_args_list or ():
        parser.add_argument(*flag_args, **flag_kwargs)
    parsed = parser.parse_args()
    print(f"Command Line Arguments: {parsed}")
    os.environ.update(
        {
            name: str(value)
            for name, value in vars(parsed).items()
            if value is not None
        }
    )
|
@@ -1,28 +0,0 @@
|
|
1
|
-
from typing import Optional
|
2
|
-
|
3
|
-
from pyiceberg.catalog import Catalog
|
4
|
-
from deltacat.storage.model.scan.push_down import Pushdown
|
5
|
-
from deltacat.storage.model.scan.scan_plan import ScanPlan
|
6
|
-
from deltacat.storage.model.scan.scan_task import FileScanTask, DataFile
|
7
|
-
from deltacat.storage.util.scan_planner import ScanPlanner
|
8
|
-
from deltacat.storage.iceberg.impl import _try_load_iceberg_table
|
9
|
-
|
10
|
-
|
11
|
-
class IcebergScanPlanner(ScanPlanner):
    """
    ScanPlanner that builds scan plans from the files of an Iceberg table.
    """

    def __init__(self, catalog: Catalog):
        """
        Args:
            catalog: Catalog used to load the Iceberg tables to plan scans for.
        """
        self.catalog = catalog

    def create_scan_plan(
        self,
        table_name: str,
        namespace: Optional[str] = None,
        pushdown: Optional[Pushdown] = None,
    ) -> ScanPlan:
        """
        Create a scan plan covering the files planned for an Iceberg table.

        Args:
            table_name: Name of the table to load from the catalog.
            namespace: Namespace passed along when loading the table.
            pushdown: Currently ignored (see TODO below).

        Returns:
            A ScanPlan with one FileScanTask per planned file, each wrapping
            a single DataFile built from that file's path.
        """
        table = _try_load_iceberg_table(
            self.catalog, namespace=namespace, table_name=table_name
        )
        # TODO: implement predicate pushdown to Iceberg
        planned_files = table.scan().plan_files()
        tasks = [
            FileScanTask([DataFile(planned.file.file_path)])
            for planned in planned_files
        ]
        return ScanPlan(tasks)
|
@@ -1,5 +0,0 @@
|
|
1
|
-
# TODO later on this will be moved to a dedicated package
|
2
|
-
from deltacat.storage.rivulet.feather.file_reader import FeatherFileReader
|
3
|
-
from deltacat.storage.rivulet.reader.reader_type_registrar import FileReaderRegistrar
|
4
|
-
|
5
|
-
# Runs at import time: registers FeatherFileReader with FileReaderRegistrar
# under "feather" (presumably the file-format key - confirm against the
# registrar's lookup).
FileReaderRegistrar.register_reader("feather", FeatherFileReader)
|
@@ -1,5 +0,0 @@
|
|
1
|
-
# TODO later on this will be moved to a dedicated package
|
2
|
-
from deltacat.storage.rivulet.parquet.file_reader import ParquetFileReader
|
3
|
-
from deltacat.storage.rivulet.reader.reader_type_registrar import FileReaderRegistrar
|
4
|
-
|
5
|
-
# Runs at import time: registers ParquetFileReader with FileReaderRegistrar
# under "parquet" (presumably the file-format key - confirm against the
# registrar's lookup).
FileReaderRegistrar.register_reader("parquet", ParquetFileReader)
|
@@ -1,231 +0,0 @@
|
|
1
|
-
import pytest
|
2
|
-
import os
|
3
|
-
from moto import mock_s3
|
4
|
-
import boto3
|
5
|
-
from boto3.resources.base import ServiceResource
|
6
|
-
from deltacat.compute.compactor.utils.round_completion_file import (
|
7
|
-
read_round_completion_file,
|
8
|
-
write_round_completion_file,
|
9
|
-
)
|
10
|
-
from deltacat.tests.compute.test_util_common import get_test_partition_locator
|
11
|
-
from deltacat.compute.compactor import RoundCompletionInfo
|
12
|
-
|
13
|
-
# Name of the S3 bucket that the fixtures below create and that the tests
# write round completion files to / read them from.
RCF_BUCKET_NAME = "rcf-bucket"
|
14
|
-
|
15
|
-
|
16
|
-
@pytest.fixture(autouse=True, scope="module")
def mock_aws_credential():
    """
    Export placeholder AWS credentials and a default region via os.environ
    for every test in this module.

    The variables are set before the tests run and are not removed
    afterwards.
    """
    os.environ["AWS_ACCESS_KEY_ID"] = "testing"
    # Fixed: this used to set "AWS_SECRET_ACCESS_ID", which is not a
    # credential variable; the standard name is AWS_SECRET_ACCESS_KEY.
    os.environ["AWS_SECRET_ACCESS_KEY"] = "testing"
    os.environ["AWS_SECURITY_TOKEN"] = "testing"
    os.environ["AWS_SESSION_TOKEN"] = "testing"
    os.environ["AWS_DEFAULT_REGION"] = "us-east-1"
    yield
|
24
|
-
|
25
|
-
|
26
|
-
@pytest.fixture(autouse=True, scope="module")
def s3_resource(mock_aws_credential):
    """
    Yield a boto3 "s3" resource created inside a mock_s3() context.

    The mock_s3() context stays open for as long as the fixture is active
    and is exited when the fixture is torn down.
    """
    with mock_s3():
        resource = boto3.resource("s3")
        yield resource
|
30
|
-
|
31
|
-
|
32
|
-
@pytest.fixture(autouse=True, scope="function")
def setup_compaction_artifacts_s3_bucket(s3_resource: ServiceResource):
    """
    Create the RCF bucket before each test and delete all of its objects
    once the test has finished.
    """
    bucket_settings = {
        "ACL": "authenticated-read",
        "Bucket": RCF_BUCKET_NAME,
    }
    s3_resource.create_bucket(**bucket_settings)
    yield
    rcf_bucket = s3_resource.Bucket(RCF_BUCKET_NAME)
    rcf_bucket.objects.all().delete()
|
40
|
-
|
41
|
-
|
42
|
-
class TestReadWriteRoundCompletionFile:
    """
    Round-trip tests for write_round_completion_file and
    read_round_completion_file against the RCF bucket, covering the
    combinations of writing/reading with and without a destination locator.
    """

    @staticmethod
    def _build_rcf(high_watermark=122, sort_keys_bit_width=12):
        """
        Build the RoundCompletionInfo used as the expected value in a test.
        """
        return RoundCompletionInfo.of(
            high_watermark=high_watermark,
            compacted_delta_locator={},
            compacted_pyarrow_write_result={},
            sort_keys_bit_width=sort_keys_bit_width,
        )

    def test_read_when_rcf_written_without_destination(self):
        """
        An rcf written without a destination locator is still returned when
        it is read with a destination locator (backward compatibility with
        previously written rcf files).
        """
        source_locator = get_test_partition_locator("source")
        destination_locator = get_test_partition_locator("destination")
        expected_rcf = self._build_rcf()

        rcf_url = write_round_completion_file(
            RCF_BUCKET_NAME, source_locator, None, expected_rcf
        )
        rcf = read_round_completion_file(
            RCF_BUCKET_NAME, source_locator, destination_locator
        )

        expected_url = (
            "s3://rcf-bucket/f9829af39770d904dbb811bd8f4e886dd307f507.json"
        )
        assert rcf_url == expected_url
        assert rcf == expected_rcf

    def test_read_when_rcf_written_with_destination(self):
        """
        An rcf written with a destination locator is stored under the
        expected source/destination URL and is returned when read with the
        same locators.
        """
        source_locator = get_test_partition_locator("source")
        destination_locator = get_test_partition_locator("destination")
        expected_rcf = self._build_rcf()

        rcf_url = write_round_completion_file(
            RCF_BUCKET_NAME, source_locator, destination_locator, expected_rcf
        )
        rcf = read_round_completion_file(
            RCF_BUCKET_NAME, source_locator, destination_locator
        )

        expected_url = "s3://rcf-bucket/f9829af39770d904dbb811bd8f4e886dd307f507/e9939deadc091b3289a2eb0ca56b1ba86b9892f4.json"
        assert rcf_url == expected_url
        assert rcf == expected_rcf

    def test_read_without_destination_when_rcf_written_with_destination(self):
        """
        Reading without a destination locator returns None when the rcf was
        only written with a destination locator.
        """
        source_locator = get_test_partition_locator("source")
        destination_locator = get_test_partition_locator("destination")
        expected_rcf = self._build_rcf()

        write_round_completion_file(
            RCF_BUCKET_NAME, source_locator, destination_locator, expected_rcf
        )
        rcf = read_round_completion_file(RCF_BUCKET_NAME, source_locator, None)

        assert rcf is None

    def test_read_without_destination_when_rcf_written_without_destination(self):
        """
        An rcf written without a destination locator is returned when it is
        read without a destination locator.
        """
        source_locator = get_test_partition_locator("source")
        expected_rcf = self._build_rcf()

        write_round_completion_file(RCF_BUCKET_NAME, source_locator, None, expected_rcf)
        rcf = read_round_completion_file(RCF_BUCKET_NAME, source_locator, None)

        assert rcf == expected_rcf

    def test_read_when_rcf_written_both_with_and_without_destination(self):
        """
        When rcf files exist both with and without a destination locator,
        reading with the destination locator returns the one written with it.
        """
        source_locator = get_test_partition_locator("source")
        destination_locator = get_test_partition_locator("destination")
        expected_rcf = self._build_rcf()
        expected_rcf_2 = self._build_rcf(
            high_watermark=1223, sort_keys_bit_width=1233
        )

        write_round_completion_file(RCF_BUCKET_NAME, source_locator, None, expected_rcf)
        write_round_completion_file(
            RCF_BUCKET_NAME, source_locator, destination_locator, expected_rcf_2
        )
        rcf = read_round_completion_file(
            RCF_BUCKET_NAME, source_locator, destination_locator
        )

        assert rcf == expected_rcf_2

    def test_read_when_none_destination_partition_id(self):
        """
        An rcf round-trips when the destination locator is built with
        get_test_partition_locator(None).
        """
        source_locator = get_test_partition_locator("source")
        destination_locator = get_test_partition_locator(None)
        expected_rcf = self._build_rcf()

        write_round_completion_file(
            RCF_BUCKET_NAME, source_locator, destination_locator, expected_rcf
        )
        rcf = read_round_completion_file(
            RCF_BUCKET_NAME, source_locator, destination_locator
        )

        assert rcf == expected_rcf

    def test_write_when_custom_url_is_passed(self):
        """
        Writing with an explicit completion_file_s3_url returns that URL, and
        a read without a destination locator then returns None.
        """
        source_locator = get_test_partition_locator("source")
        expected_rcf = self._build_rcf()
        completion_file_s3_url = f"s3://{RCF_BUCKET_NAME}/test.json"

        rcf_url = write_round_completion_file(
            RCF_BUCKET_NAME,
            source_locator,
            None,
            expected_rcf,
            completion_file_s3_url=completion_file_s3_url,
        )
        rcf = read_round_completion_file(RCF_BUCKET_NAME, source_locator, None)

        assert rcf_url == completion_file_s3_url
        assert rcf is None
|