deltacat 2.0.0b10__py3-none-any.whl → 2.0.0b12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +96 -17
- deltacat/api.py +122 -67
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/benchmark_engine.py +4 -2
- deltacat/benchmarking/conftest.py +0 -18
- deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
- deltacat/catalog/__init__.py +64 -5
- deltacat/catalog/delegate.py +445 -63
- deltacat/catalog/interface.py +188 -62
- deltacat/catalog/main/impl.py +2435 -279
- deltacat/catalog/model/catalog.py +154 -77
- deltacat/catalog/model/properties.py +63 -22
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
- deltacat/compute/compactor/model/round_completion_info.py +16 -6
- deltacat/compute/compactor/repartition_session.py +8 -21
- deltacat/compute/compactor/steps/hash_bucket.py +5 -5
- deltacat/compute/compactor/steps/materialize.py +9 -7
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +6 -5
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +17 -14
- deltacat/compute/compactor_v2/constants.py +30 -1
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +33 -8
- deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +267 -55
- deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +11 -4
- deltacat/compute/compactor_v2/utils/merge.py +15 -2
- deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
- deltacat/compute/compactor_v2/utils/task_options.py +45 -33
- deltacat/compute/converter/converter_session.py +145 -32
- deltacat/compute/converter/model/convert_input.py +26 -19
- deltacat/compute/converter/model/convert_input_files.py +33 -16
- deltacat/compute/converter/model/convert_result.py +35 -16
- deltacat/compute/converter/model/converter_session_params.py +24 -21
- deltacat/compute/converter/pyiceberg/catalog.py +21 -18
- deltacat/compute/converter/pyiceberg/overrides.py +18 -9
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
- deltacat/compute/converter/steps/convert.py +157 -50
- deltacat/compute/converter/steps/dedupe.py +24 -11
- deltacat/compute/converter/utils/convert_task_options.py +27 -12
- deltacat/compute/converter/utils/converter_session_utils.py +126 -60
- deltacat/compute/converter/utils/iceberg_columns.py +8 -8
- deltacat/compute/converter/utils/io.py +101 -12
- deltacat/compute/converter/utils/s3u.py +33 -27
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/client.py +25 -12
- deltacat/compute/resource_estimation/delta.py +38 -6
- deltacat/compute/resource_estimation/model.py +8 -0
- deltacat/constants.py +45 -2
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/env.py +10 -0
- deltacat/examples/basic_logging.py +1 -3
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +3 -5
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
- deltacat/examples/indexer/indexer.py +2 -2
- deltacat/examples/indexer/job_runner.py +1 -2
- deltacat/exceptions.py +66 -4
- deltacat/experimental/catalog/iceberg/__init__.py +6 -0
- deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
- deltacat/{catalog → experimental/catalog}/iceberg/impl.py +29 -11
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
- deltacat/{storage → experimental/storage}/iceberg/impl.py +6 -4
- deltacat/{storage → experimental/storage}/iceberg/model.py +7 -3
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/__init__.py +11 -0
- deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
- deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -12
- deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
- deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
- deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
- deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
- deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -3
- deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
- deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
- deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
- deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +7 -6
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
- deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
- deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
- deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
- deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
- deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
- deltacat/io/datasource/deltacat_datasource.py +0 -1
- deltacat/io/reader/deltacat_read_api.py +1 -1
- deltacat/storage/__init__.py +20 -2
- deltacat/storage/interface.py +54 -32
- deltacat/storage/main/impl.py +1494 -541
- deltacat/storage/model/delta.py +27 -3
- deltacat/storage/model/locator.py +6 -12
- deltacat/storage/model/manifest.py +182 -6
- deltacat/storage/model/metafile.py +151 -78
- deltacat/storage/model/namespace.py +8 -1
- deltacat/storage/model/partition.py +117 -42
- deltacat/storage/model/schema.py +2427 -159
- deltacat/storage/model/shard.py +6 -2
- deltacat/storage/model/sort_key.py +40 -0
- deltacat/storage/model/stream.py +9 -2
- deltacat/storage/model/table.py +12 -1
- deltacat/storage/model/table_version.py +11 -0
- deltacat/storage/model/transaction.py +1184 -208
- deltacat/storage/model/transform.py +81 -2
- deltacat/storage/model/types.py +48 -26
- deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/data/__init__.py +0 -0
- deltacat/tests/catalog/main/__init__.py +0 -0
- deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
- deltacat/tests/catalog/model/__init__.py +0 -0
- deltacat/tests/catalog/model/test_table_definition.py +16 -0
- deltacat/tests/catalog/test_catalogs.py +103 -106
- deltacat/tests/catalog/test_default_catalog_impl.py +12152 -72
- deltacat/tests/compute/compact_partition_test_cases.py +35 -8
- deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
- deltacat/tests/compute/compactor/utils/test_io.py +124 -120
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
- deltacat/tests/compute/conftest.py +8 -44
- deltacat/tests/compute/converter/test_convert_session.py +675 -490
- deltacat/tests/compute/converter/utils.py +15 -6
- deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
- deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
- deltacat/tests/compute/test_compact_partition_params.py +13 -8
- deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +716 -43
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
- deltacat/tests/daft/__init__.py +0 -0
- deltacat/tests/daft/test_model.py +97 -0
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/catalog/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/experimental/daft/__init__.py +0 -0
- deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
- deltacat/tests/experimental/storage/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
- deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -3
- deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
- deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
- deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +5 -3
- deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
- deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
- deltacat/tests/storage/main/test_main_storage.py +6900 -95
- deltacat/tests/storage/model/test_metafile_io.py +78 -173
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +171 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_shard.py +3 -1
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_transaction.py +393 -48
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +988 -4
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/pyarrow.py +52 -21
- deltacat/tests/test_utils/storage.py +23 -34
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +121 -31
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1370 -89
- deltacat/types/media.py +224 -14
- deltacat/types/tables.py +2329 -59
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +823 -36
- deltacat/utils/export.py +3 -1
- deltacat/utils/filesystem.py +100 -0
- deltacat/utils/metafile_locator.py +2 -1
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +658 -27
- deltacat/utils/pyarrow.py +1258 -213
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +57 -16
- deltacat-2.0.0b12.dist-info/METADATA +1163 -0
- deltacat-2.0.0b12.dist-info/RECORD +439 -0
- {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info}/WHEEL +1 -1
- deltacat/catalog/iceberg/__init__.py +0 -4
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/daft/daft_scan.py +0 -115
- deltacat/daft/model.py +0 -258
- deltacat/daft/translator.py +0 -126
- deltacat/examples/common/fixtures.py +0 -15
- deltacat/storage/iceberg/iceberg_scan_planner.py +0 -28
- deltacat/storage/rivulet/__init__.py +0 -11
- deltacat/storage/rivulet/feather/__init__.py +0 -5
- deltacat/storage/rivulet/parquet/__init__.py +0 -5
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-2.0.0b10.dist-info/METADATA +0 -68
- deltacat-2.0.0b10.dist-info/RECORD +0 -381
- /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
- /deltacat/{daft → docs/autogen/schema}/__init__.py +0 -0
- /deltacat/{examples/common → docs/autogen/schema/inference}/__init__.py +0 -0
- /deltacat/examples/{iceberg → compactor}/__init__.py +0 -0
- /deltacat/{storage/iceberg → examples/experimental}/__init__.py +0 -0
- /deltacat/{storage/rivulet/arrow → examples/experimental/iceberg}/__init__.py +0 -0
- /deltacat/{storage/rivulet/fs → examples/experimental/iceberg/converter}/__init__.py +0 -0
- /deltacat/{storage/rivulet/metastore → examples/experimental/iceberg/converter/beam}/__init__.py +0 -0
- /deltacat/{storage/rivulet/reader → experimental/catalog}/__init__.py +0 -0
- /deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +0 -0
- /deltacat/{storage/rivulet/schema → experimental/compatibility}/__init__.py +0 -0
- /deltacat/{storage/rivulet/writer → experimental/converter_agent}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet → experimental/converter_agent/beam}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/fs → experimental/storage}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/schema → experimental/storage/iceberg}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/writer → experimental/storage/rivulet/arrow}/__init__.py +0 -0
- /deltacat/{storage/rivulet/parquet/data_reader.py → experimental/storage/rivulet/fs/__init__.py} +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
- {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info/licenses}/LICENSE +0 -0
- {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info}/top_level.txt +0 -0
@@ -1,9 +1,13 @@
|
|
1
|
-
from typing import
|
1
|
+
from typing import List, Dict
|
2
|
+
from collections import defaultdict
|
2
3
|
import uuid
|
4
|
+
from pyiceberg.table import Table
|
5
|
+
from pyiceberg.table.metadata import TableMetadata
|
3
6
|
from pyiceberg.table.snapshots import (
|
4
7
|
Operation,
|
5
8
|
)
|
6
9
|
from pyiceberg.manifest import (
|
10
|
+
DataFile,
|
7
11
|
DataFileContent,
|
8
12
|
ManifestContent,
|
9
13
|
ManifestEntry,
|
@@ -13,71 +17,116 @@ from pyiceberg.manifest import (
|
|
13
17
|
)
|
14
18
|
import itertools
|
15
19
|
from pyiceberg.utils.concurrent import ExecutorFactory
|
16
|
-
from pyiceberg.table.update.snapshot import
|
20
|
+
from pyiceberg.table.update.snapshot import _SnapshotProducer, UpdateSnapshot
|
17
21
|
|
18
22
|
|
19
|
-
|
20
|
-
|
23
|
+
def replace_delete_files_override(
|
24
|
+
update_snapshot: UpdateSnapshot,
|
25
|
+
) -> "_ReplaceDeleteFilesOverride":
|
26
|
+
commit_uuid = uuid.uuid4()
|
27
|
+
return _ReplaceDeleteFilesOverride(
|
28
|
+
commit_uuid=commit_uuid,
|
29
|
+
operation=Operation.OVERWRITE,
|
30
|
+
transaction=update_snapshot._transaction,
|
31
|
+
io=update_snapshot._io,
|
32
|
+
snapshot_properties=update_snapshot._snapshot_properties,
|
33
|
+
)
|
34
|
+
|
35
|
+
|
36
|
+
class _ReplaceDeleteFilesOverride(_SnapshotProducer):
|
37
|
+
def _manifests(self) -> List[ManifestFile]:
|
38
|
+
def _write_added_manifest() -> List[ManifestFile]:
|
39
|
+
if self._added_data_files:
|
40
|
+
with write_manifest(
|
41
|
+
format_version=self._transaction.table_metadata.format_version,
|
42
|
+
spec=self._transaction.table_metadata.spec(),
|
43
|
+
schema=self._transaction.table_metadata.schema(),
|
44
|
+
output_file=self.new_manifest_output(),
|
45
|
+
snapshot_id=self._snapshot_id,
|
46
|
+
) as writer:
|
47
|
+
for data_file in self._added_data_files:
|
48
|
+
writer.add(
|
49
|
+
ManifestEntry(
|
50
|
+
status=ManifestEntryStatus.ADDED,
|
51
|
+
snapshot_id=self._snapshot_id,
|
52
|
+
sequence_number=None,
|
53
|
+
file_sequence_number=None,
|
54
|
+
data_file=data_file,
|
55
|
+
)
|
56
|
+
)
|
57
|
+
writer.content = self.writer_content
|
58
|
+
return [writer.to_manifest_file()]
|
59
|
+
else:
|
60
|
+
return []
|
61
|
+
|
62
|
+
def _write_delete_manifest() -> List[ManifestFile]:
|
63
|
+
# Check if we need to mark the files as deleted
|
64
|
+
deleted_entries = self._deleted_entries()
|
65
|
+
if len(deleted_entries) > 0:
|
66
|
+
deleted_manifests = []
|
67
|
+
partition_groups: Dict[int, List[ManifestEntry]] = defaultdict(list)
|
68
|
+
for deleted_entry in deleted_entries:
|
69
|
+
partition_groups[deleted_entry.data_file.spec_id].append(
|
70
|
+
deleted_entry
|
71
|
+
)
|
72
|
+
for spec_id, entries in partition_groups.items():
|
73
|
+
with write_manifest(
|
74
|
+
format_version=self._transaction.table_metadata.format_version,
|
75
|
+
spec=self._transaction.table_metadata.specs()[spec_id],
|
76
|
+
schema=self._transaction.table_metadata.schema(),
|
77
|
+
output_file=self.new_manifest_output(),
|
78
|
+
snapshot_id=self._snapshot_id,
|
79
|
+
) as writer:
|
80
|
+
for entry in entries:
|
81
|
+
writer.add_entry(entry)
|
82
|
+
deleted_manifests.append(writer.to_manifest_file())
|
83
|
+
return deleted_manifests
|
84
|
+
else:
|
85
|
+
return []
|
21
86
|
|
22
|
-
|
23
|
-
|
87
|
+
executor = ExecutorFactory.get_or_create()
|
88
|
+
|
89
|
+
added_manifests = executor.submit(_write_added_manifest)
|
90
|
+
existing_manifests = executor.submit(self._existing_manifests)
|
91
|
+
delete_manifests = executor.submit(_write_delete_manifest)
|
92
|
+
return self._process_manifests(
|
93
|
+
added_manifests.result()
|
94
|
+
+ existing_manifests.result()
|
95
|
+
+ delete_manifests.result()
|
96
|
+
)
|
97
|
+
|
98
|
+
def writer_content(self) -> ManifestContent:
|
99
|
+
return ManifestContent.DELETES
|
24
100
|
|
25
101
|
def _existing_manifests(self) -> List[ManifestFile]:
|
26
|
-
"""
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
102
|
+
"""To determine if there are any existing manifest files.
|
103
|
+
|
104
|
+
A fast append will add another ManifestFile to the ManifestList.
|
105
|
+
All the existing manifest files are considered existing.
|
106
|
+
"""
|
107
|
+
existing_manifests = []
|
108
|
+
|
109
|
+
if self._parent_snapshot_id is not None:
|
110
|
+
previous_snapshot = self._transaction.table_metadata.snapshot_by_id(
|
111
|
+
self._parent_snapshot_id
|
112
|
+
)
|
113
|
+
|
114
|
+
if previous_snapshot is None:
|
115
|
+
raise ValueError(
|
116
|
+
f"Snapshot could not be found: {self._parent_snapshot_id}"
|
33
117
|
)
|
34
118
|
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
119
|
+
for manifest in previous_snapshot.manifests(io=self._io):
|
120
|
+
if (
|
121
|
+
manifest.has_added_files()
|
122
|
+
or manifest.has_existing_files()
|
123
|
+
or manifest.added_snapshot_id == self._snapshot_id
|
124
|
+
):
|
125
|
+
existing_manifests.append(manifest)
|
40
126
|
|
41
|
-
|
42
|
-
existing_files.append(manifest_file)
|
43
|
-
else:
|
44
|
-
# We have to replace the manifest file without the deleted data files
|
45
|
-
if any(
|
46
|
-
entry.data_file not in found_deleted_data_files
|
47
|
-
for entry in entries
|
48
|
-
):
|
49
|
-
with write_manifest(
|
50
|
-
format_version=self._transaction.table_metadata.format_version,
|
51
|
-
spec=self._transaction.table_metadata.specs()[
|
52
|
-
manifest_file.partition_spec_id
|
53
|
-
],
|
54
|
-
schema=self._transaction.table_metadata.schema(),
|
55
|
-
output_file=self.new_manifest_output(),
|
56
|
-
snapshot_id=self._snapshot_id,
|
57
|
-
) as writer:
|
58
|
-
[
|
59
|
-
writer.add_entry(
|
60
|
-
ManifestEntry(
|
61
|
-
status=ManifestEntryStatus.EXISTING,
|
62
|
-
snapshot_id=entry.snapshot_id,
|
63
|
-
sequence_number=entry.sequence_number,
|
64
|
-
file_sequence_number=entry.file_sequence_number,
|
65
|
-
data_file=entry.data_file,
|
66
|
-
)
|
67
|
-
)
|
68
|
-
for entry in entries
|
69
|
-
if entry.data_file not in found_deleted_data_files
|
70
|
-
]
|
71
|
-
existing_files.append(writer.to_manifest_file())
|
72
|
-
return existing_files
|
127
|
+
return existing_manifests
|
73
128
|
|
74
129
|
def _deleted_entries(self) -> List[ManifestEntry]:
|
75
|
-
"""To determine if we need to record any deleted entries.
|
76
|
-
|
77
|
-
With a full overwrite all the entries are considered deleted.
|
78
|
-
With partial overwrites we have to use the predicate to evaluate
|
79
|
-
which entries are affected.
|
80
|
-
"""
|
81
130
|
if self._parent_snapshot_id is not None:
|
82
131
|
previous_snapshot = self._transaction.table_metadata.snapshot_by_id(
|
83
132
|
self._parent_snapshot_id
|
@@ -102,7 +151,7 @@ class _ReplaceFiles(_SnapshotProducer["_ReplaceFiles"]):
|
|
102
151
|
for entry in manifest.fetch_manifest_entry(
|
103
152
|
self._io, discard_deleted=True
|
104
153
|
)
|
105
|
-
if entry.data_file.content == DataFileContent.
|
154
|
+
if entry.data_file.content == DataFileContent.EQUALITY_DELETES
|
106
155
|
and entry.data_file in self._deleted_data_files
|
107
156
|
]
|
108
157
|
|
@@ -114,45 +163,30 @@ class _ReplaceFiles(_SnapshotProducer["_ReplaceFiles"]):
|
|
114
163
|
return []
|
115
164
|
|
116
165
|
|
117
|
-
def
|
118
|
-
|
119
|
-
|
120
|
-
using_starting_sequence: Optional[bool] = False,
|
121
|
-
) -> _ReplaceFiles:
|
122
|
-
return _ReplaceFiles(
|
123
|
-
commit_uuid=commit_uuid,
|
124
|
-
operation=Operation.REPLACE
|
125
|
-
if self._transaction.table_metadata.current_snapshot() is not None
|
126
|
-
else Operation.APPEND,
|
127
|
-
transaction=self._transaction,
|
128
|
-
io=self._io,
|
129
|
-
snapshot_properties=self._snapshot_properties,
|
130
|
-
using_starting_sequence=using_starting_sequence,
|
131
|
-
)
|
132
|
-
|
133
|
-
|
134
|
-
UpdateSnapshot.replace = replace
|
135
|
-
|
136
|
-
|
137
|
-
def commit_replace_snapshot(
|
138
|
-
iceberg_table, to_be_deleted_files_list, new_position_delete_files
|
139
|
-
):
|
166
|
+
def commit_append_snapshot(
|
167
|
+
iceberg_table: Table, new_position_delete_files: List[DataFile]
|
168
|
+
) -> TableMetadata:
|
140
169
|
tx = iceberg_table.transaction()
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
170
|
+
try:
|
171
|
+
if iceberg_table.metadata.name_mapping() is None:
|
172
|
+
tx.set_properties(
|
173
|
+
**{
|
174
|
+
"schema.name-mapping.default": tx.table_metadata.schema().name_mapping.model_dump_json()
|
175
|
+
}
|
176
|
+
)
|
177
|
+
with append_delete_files_override(tx.update_snapshot()) as append_snapshot:
|
178
|
+
if new_position_delete_files:
|
179
|
+
for data_file in new_position_delete_files:
|
180
|
+
append_snapshot.append_data_file(data_file)
|
181
|
+
except Exception as e:
|
182
|
+
raise e
|
183
|
+
else:
|
184
|
+
return tx.commit_transaction().metadata
|
153
185
|
|
154
186
|
|
155
|
-
def append_delete_files_override(
|
187
|
+
def append_delete_files_override(
|
188
|
+
update_snapshot: UpdateSnapshot,
|
189
|
+
) -> "_AppendDeleteFilesOverride":
|
156
190
|
commit_uuid = uuid.uuid4()
|
157
191
|
return _AppendDeleteFilesOverride(
|
158
192
|
commit_uuid=commit_uuid,
|
@@ -164,8 +198,8 @@ def append_delete_files_override(update_snapshot):
|
|
164
198
|
|
165
199
|
|
166
200
|
class _AppendDeleteFilesOverride(_SnapshotProducer):
|
167
|
-
def _manifests(self):
|
168
|
-
def _write_added_manifest():
|
201
|
+
def _manifests(self) -> List[ManifestFile]:
|
202
|
+
def _write_added_manifest() -> List[ManifestFile]:
|
169
203
|
if self._added_data_files:
|
170
204
|
with write_manifest(
|
171
205
|
format_version=self._transaction.table_metadata.format_version,
|
@@ -198,7 +232,7 @@ class _AppendDeleteFilesOverride(_SnapshotProducer):
|
|
198
232
|
added_manifests.result() + existing_manifests.result()
|
199
233
|
)
|
200
234
|
|
201
|
-
def writer_content(self):
|
235
|
+
def writer_content(self) -> ManifestContent:
|
202
236
|
return ManifestContent.DELETES
|
203
237
|
|
204
238
|
def _existing_manifests(self) -> List[ManifestFile]:
|
@@ -237,15 +271,29 @@ class _AppendDeleteFilesOverride(_SnapshotProducer):
|
|
237
271
|
return []
|
238
272
|
|
239
273
|
|
240
|
-
def
|
241
|
-
|
274
|
+
def commit_replace_snapshot(
|
275
|
+
iceberg_table: Table,
|
276
|
+
new_position_delete_files: List[DataFile],
|
277
|
+
to_be_deleted_files: List[DataFile],
|
278
|
+
) -> TableMetadata:
|
279
|
+
tx = iceberg_table.transaction()
|
280
|
+
try:
|
242
281
|
if iceberg_table.metadata.name_mapping() is None:
|
243
282
|
tx.set_properties(
|
244
283
|
**{
|
245
284
|
"schema.name-mapping.default": tx.table_metadata.schema().name_mapping.model_dump_json()
|
246
285
|
}
|
247
286
|
)
|
248
|
-
with
|
287
|
+
with replace_delete_files_override(
|
288
|
+
tx.update_snapshot()
|
289
|
+
) as replace_delete_snapshot:
|
249
290
|
if new_position_delete_files:
|
250
291
|
for data_file in new_position_delete_files:
|
251
|
-
|
292
|
+
replace_delete_snapshot.append_data_file(data_file)
|
293
|
+
if to_be_deleted_files:
|
294
|
+
for delete_file in to_be_deleted_files:
|
295
|
+
replace_delete_snapshot.delete_data_file(delete_file)
|
296
|
+
except Exception as e:
|
297
|
+
raise e
|
298
|
+
else:
|
299
|
+
return tx.commit_transaction().metadata
|