deltacat 2.0.0b10__py3-none-any.whl → 2.0.0b12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +96 -17
- deltacat/api.py +122 -67
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/benchmark_engine.py +4 -2
- deltacat/benchmarking/conftest.py +0 -18
- deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
- deltacat/catalog/__init__.py +64 -5
- deltacat/catalog/delegate.py +445 -63
- deltacat/catalog/interface.py +188 -62
- deltacat/catalog/main/impl.py +2435 -279
- deltacat/catalog/model/catalog.py +154 -77
- deltacat/catalog/model/properties.py +63 -22
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
- deltacat/compute/compactor/model/round_completion_info.py +16 -6
- deltacat/compute/compactor/repartition_session.py +8 -21
- deltacat/compute/compactor/steps/hash_bucket.py +5 -5
- deltacat/compute/compactor/steps/materialize.py +9 -7
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +6 -5
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +17 -14
- deltacat/compute/compactor_v2/constants.py +30 -1
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +33 -8
- deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +267 -55
- deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +11 -4
- deltacat/compute/compactor_v2/utils/merge.py +15 -2
- deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
- deltacat/compute/compactor_v2/utils/task_options.py +45 -33
- deltacat/compute/converter/converter_session.py +145 -32
- deltacat/compute/converter/model/convert_input.py +26 -19
- deltacat/compute/converter/model/convert_input_files.py +33 -16
- deltacat/compute/converter/model/convert_result.py +35 -16
- deltacat/compute/converter/model/converter_session_params.py +24 -21
- deltacat/compute/converter/pyiceberg/catalog.py +21 -18
- deltacat/compute/converter/pyiceberg/overrides.py +18 -9
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
- deltacat/compute/converter/steps/convert.py +157 -50
- deltacat/compute/converter/steps/dedupe.py +24 -11
- deltacat/compute/converter/utils/convert_task_options.py +27 -12
- deltacat/compute/converter/utils/converter_session_utils.py +126 -60
- deltacat/compute/converter/utils/iceberg_columns.py +8 -8
- deltacat/compute/converter/utils/io.py +101 -12
- deltacat/compute/converter/utils/s3u.py +33 -27
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/client.py +25 -12
- deltacat/compute/resource_estimation/delta.py +38 -6
- deltacat/compute/resource_estimation/model.py +8 -0
- deltacat/constants.py +45 -2
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/env.py +10 -0
- deltacat/examples/basic_logging.py +1 -3
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +3 -5
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
- deltacat/examples/indexer/indexer.py +2 -2
- deltacat/examples/indexer/job_runner.py +1 -2
- deltacat/exceptions.py +66 -4
- deltacat/experimental/catalog/iceberg/__init__.py +6 -0
- deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
- deltacat/{catalog → experimental/catalog}/iceberg/impl.py +29 -11
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
- deltacat/{storage → experimental/storage}/iceberg/impl.py +6 -4
- deltacat/{storage → experimental/storage}/iceberg/model.py +7 -3
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/__init__.py +11 -0
- deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
- deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -12
- deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
- deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
- deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
- deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
- deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -3
- deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
- deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
- deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
- deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +7 -6
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
- deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
- deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
- deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
- deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
- deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
- deltacat/io/datasource/deltacat_datasource.py +0 -1
- deltacat/io/reader/deltacat_read_api.py +1 -1
- deltacat/storage/__init__.py +20 -2
- deltacat/storage/interface.py +54 -32
- deltacat/storage/main/impl.py +1494 -541
- deltacat/storage/model/delta.py +27 -3
- deltacat/storage/model/locator.py +6 -12
- deltacat/storage/model/manifest.py +182 -6
- deltacat/storage/model/metafile.py +151 -78
- deltacat/storage/model/namespace.py +8 -1
- deltacat/storage/model/partition.py +117 -42
- deltacat/storage/model/schema.py +2427 -159
- deltacat/storage/model/shard.py +6 -2
- deltacat/storage/model/sort_key.py +40 -0
- deltacat/storage/model/stream.py +9 -2
- deltacat/storage/model/table.py +12 -1
- deltacat/storage/model/table_version.py +11 -0
- deltacat/storage/model/transaction.py +1184 -208
- deltacat/storage/model/transform.py +81 -2
- deltacat/storage/model/types.py +48 -26
- deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/data/__init__.py +0 -0
- deltacat/tests/catalog/main/__init__.py +0 -0
- deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
- deltacat/tests/catalog/model/__init__.py +0 -0
- deltacat/tests/catalog/model/test_table_definition.py +16 -0
- deltacat/tests/catalog/test_catalogs.py +103 -106
- deltacat/tests/catalog/test_default_catalog_impl.py +12152 -72
- deltacat/tests/compute/compact_partition_test_cases.py +35 -8
- deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
- deltacat/tests/compute/compactor/utils/test_io.py +124 -120
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
- deltacat/tests/compute/conftest.py +8 -44
- deltacat/tests/compute/converter/test_convert_session.py +675 -490
- deltacat/tests/compute/converter/utils.py +15 -6
- deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
- deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
- deltacat/tests/compute/test_compact_partition_params.py +13 -8
- deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +716 -43
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
- deltacat/tests/daft/__init__.py +0 -0
- deltacat/tests/daft/test_model.py +97 -0
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/catalog/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/experimental/daft/__init__.py +0 -0
- deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
- deltacat/tests/experimental/storage/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
- deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -3
- deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
- deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
- deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +5 -3
- deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
- deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
- deltacat/tests/storage/main/test_main_storage.py +6900 -95
- deltacat/tests/storage/model/test_metafile_io.py +78 -173
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +171 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_shard.py +3 -1
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_transaction.py +393 -48
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +988 -4
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/pyarrow.py +52 -21
- deltacat/tests/test_utils/storage.py +23 -34
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +121 -31
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1370 -89
- deltacat/types/media.py +224 -14
- deltacat/types/tables.py +2329 -59
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +823 -36
- deltacat/utils/export.py +3 -1
- deltacat/utils/filesystem.py +100 -0
- deltacat/utils/metafile_locator.py +2 -1
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +658 -27
- deltacat/utils/pyarrow.py +1258 -213
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +57 -16
- deltacat-2.0.0b12.dist-info/METADATA +1163 -0
- deltacat-2.0.0b12.dist-info/RECORD +439 -0
- {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info}/WHEEL +1 -1
- deltacat/catalog/iceberg/__init__.py +0 -4
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/daft/daft_scan.py +0 -115
- deltacat/daft/model.py +0 -258
- deltacat/daft/translator.py +0 -126
- deltacat/examples/common/fixtures.py +0 -15
- deltacat/storage/iceberg/iceberg_scan_planner.py +0 -28
- deltacat/storage/rivulet/__init__.py +0 -11
- deltacat/storage/rivulet/feather/__init__.py +0 -5
- deltacat/storage/rivulet/parquet/__init__.py +0 -5
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-2.0.0b10.dist-info/METADATA +0 -68
- deltacat-2.0.0b10.dist-info/RECORD +0 -381
- /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
- /deltacat/{daft → docs/autogen/schema}/__init__.py +0 -0
- /deltacat/{examples/common → docs/autogen/schema/inference}/__init__.py +0 -0
- /deltacat/examples/{iceberg → compactor}/__init__.py +0 -0
- /deltacat/{storage/iceberg → examples/experimental}/__init__.py +0 -0
- /deltacat/{storage/rivulet/arrow → examples/experimental/iceberg}/__init__.py +0 -0
- /deltacat/{storage/rivulet/fs → examples/experimental/iceberg/converter}/__init__.py +0 -0
- /deltacat/{storage/rivulet/metastore → examples/experimental/iceberg/converter/beam}/__init__.py +0 -0
- /deltacat/{storage/rivulet/reader → experimental/catalog}/__init__.py +0 -0
- /deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +0 -0
- /deltacat/{storage/rivulet/schema → experimental/compatibility}/__init__.py +0 -0
- /deltacat/{storage/rivulet/writer → experimental/converter_agent}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet → experimental/converter_agent/beam}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/fs → experimental/storage}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/schema → experimental/storage/iceberg}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/writer → experimental/storage/rivulet/arrow}/__init__.py +0 -0
- /deltacat/{storage/rivulet/parquet/data_reader.py → experimental/storage/rivulet/fs/__init__.py} +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
- {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info/licenses}/LICENSE +0 -0
- {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info}/top_level.txt +0 -0
deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py ADDED
@@ -0,0 +1,263 @@
+"""
+Spark SQL utilities for Iceberg table operations.
+
+This module provides Beam DoFn classes that use Spark SQL to work with Iceberg tables,
+"""
+
+import os
+import apache_beam as beam
+from apache_beam import Row
+
+
+class SparkSQLIcebergRead(beam.DoFn):
+    """
+    Custom Beam DoFn that uses Spark SQL to read Iceberg tables.
+    """
+
+    def __init__(
+        self,
+        table_name: str,
+        catalog_uri: str = "http://localhost:8181",
+        warehouse: str = "warehouse/",
+    ):
+        """
+        Initialize the Spark SQL reader.
+
+        Args:
+            table_name: Name of the Iceberg table
+            catalog_uri: URI of the Iceberg REST catalog
+            warehouse: Warehouse path
+        """
+        self.table_name = table_name
+        self.catalog_uri = catalog_uri
+        self.warehouse = warehouse
+        self.spark = None
+
+    def setup(self):
+        """Set up Spark session (called once per worker)."""
+        try:
+            from pyspark.sql import SparkSession
+            import importlib.metadata
+
+            # Get Spark version for dependency resolution
+            try:
+                spark_version = ".".join(
+                    importlib.metadata.version("pyspark").split(".")[:2]
+                )
+            except Exception:
+                spark_version = "3.5"  # Default fallback
+
+            scala_version = "2.12"
+            iceberg_version = "1.6.0"
+
+            print(f"🔧 Setting up Spark session for reading {self.table_name}")
+            print(f"   - Spark version: {spark_version}")
+            print(f"   - Iceberg version: {iceberg_version}")
+
+            # Set Spark packages for Iceberg runtime
+            os.environ["PYSPARK_SUBMIT_ARGS"] = (
+                f"--packages org.apache.iceberg:iceberg-spark-runtime-{spark_version}_{scala_version}:{iceberg_version} "
+                f"pyspark-shell"
+            )
+
+            # Create Spark session with Iceberg REST catalog configuration
+            self.spark = (
+                SparkSession.builder.appName(f"DeltaCAT Read - {self.table_name}")
+                .config("spark.sql.session.timeZone", "UTC")
+                .config(
+                    "spark.serializer", "org.apache.spark.serializer.KryoSerializer"
+                )
+                .config(
+                    "spark.sql.extensions",
+                    "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
+                )
+                # Configure REST catalog
+                .config(
+                    "spark.sql.catalog.rest", "org.apache.iceberg.spark.SparkCatalog"
+                )
+                .config("spark.sql.catalog.rest.type", "rest")
+                .config("spark.sql.catalog.rest.uri", self.catalog_uri)
+                .config("spark.sql.catalog.rest.warehouse", self.warehouse)
+                # Set REST as default catalog
+                .config("spark.sql.defaultCatalog", "rest")
+                # Local mode configuration (within Beam workers)
+                .config("spark.master", "local[1]")  # Single thread per worker
+                .config("spark.sql.adaptive.enabled", "true")
+                # Networking binding
+                .config("spark.driver.bindAddress", "127.0.0.1")
+                .config("spark.driver.host", "127.0.0.1")
+                .config("spark.ui.enabled", "false")
+                .config("spark.sql.adaptive.coalescePartitions.enabled", "false")
+                .getOrCreate()
+            )
+
+            print(f"✅ Spark session created successfully")
+
+        except Exception as e:
+            print(f"❌ Failed to set up Spark session: {e}")
+            raise
+
+    def teardown(self):
+        """Clean up Spark session (called once per worker)."""
+        if self.spark:
+            try:
+                self.spark.stop()
+                print("✅ Spark session stopped")
+            except Exception as e:
+                print(f"⚠️ Error stopping Spark session: {e}")
+
+    def process(self, element):
+        """
+        Process element (read from Iceberg table using Spark SQL).
+
+        Args:
+            element: Input element (not used, just triggers the read)
+
+        Yields:
+            Records from the Iceberg table
+        """
+        try:
+            if not self.spark:
+                raise RuntimeError("Spark session not initialized")
+
+            print(f"📖 Reading table {self.table_name} using Spark SQL")
+
+            # Read from Iceberg table using Spark SQL
+            df = self.spark.sql(f"SELECT * FROM {self.table_name}")
+
+            # Collect all records
+            records = df.collect()
+
+            print(f"📊 Successfully read {len(records)} records from {self.table_name}")
+
+            # Convert Spark rows to Beam Row objects and yield
+            for row in records:
+                row_dict = row.asDict()
+                # Convert to Beam Row for consistency with write mode
+                beam_row = Row(**row_dict)
+                yield beam_row
+
+        except Exception as e:
+            print(f"❌ Failed to read from table {self.table_name}: {e}")
+            raise
+
+
+class SparkSQLIcebergRewrite(beam.DoFn):
+    """
+    Custom Beam DoFn that uses Spark SQL to rewrite Iceberg table data files.
+
+    This uses Spark's rewrite_data_files procedure to materialize positional deletes
+    by rewriting data files. The result is a "clean" table without positional deletes.
+    """
+
+    def __init__(self, catalog_uri, warehouse_path, table_name):
+        self.catalog_uri = catalog_uri
+        self.warehouse_path = warehouse_path
+        self.table_name = table_name
+
+    def setup(self):
+        """Initialize Spark session for rewrite operations."""
+        try:
+            from pyspark.sql import SparkSession
+            import importlib.metadata
+
+            print(f"🔧 Setting up Spark session for rewriting {self.table_name}")
+
+            # Detect Spark version for appropriate Iceberg runtime
+            spark_version = importlib.metadata.version("pyspark")
+            major_minor = ".".join(spark_version.split(".")[:2])
+            print(f"   - Spark version: {major_minor}")
+            print(f"   - Iceberg version: 1.6.0")
+
+            # Configure Spark with Iceberg
+            self.spark = (
+                SparkSession.builder.appName("IcebergRewrite")
+                .config(
+                    "spark.jars.packages",
+                    f"org.apache.iceberg:iceberg-spark-runtime-{major_minor}_2.12:1.6.0",
+                )
+                .config(
+                    "spark.sql.extensions",
+                    "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
+                )
+                .config(
+                    "spark.sql.catalog.spark_catalog",
+                    "org.apache.iceberg.spark.SparkSessionCatalog",
+                )
+                .config("spark.sql.catalog.spark_catalog.type", "rest")
+                .config("spark.sql.catalog.spark_catalog.uri", self.catalog_uri)
+                .config(
+                    "spark.sql.catalog.spark_catalog.warehouse", self.warehouse_path
+                )
+                .config("spark.driver.bindAddress", "127.0.0.1")
+                .config("spark.driver.host", "127.0.0.1")
+                .config("spark.ui.enabled", "false")
+                .getOrCreate()
+            )
+
+            print("✅ Spark session created successfully")
+
+        except ImportError as e:
+            raise RuntimeError(
+                f"PySpark is required for rewrite mode. Install with: pip install pyspark"
+            ) from e
+        except Exception as e:
+            raise RuntimeError(f"Failed to create Spark session: {e}") from e
+
+    def process(self, element):
+        """Rewrite table data files to materialize positional deletes."""
+        try:
+            print(
+                f"📋 Rewriting table {self.table_name} to materialize positional deletes"
+            )
+
+            # Use Spark's rewrite_data_files procedure with delete_file_threshold=1
+            # This forces rewrite even when there's only 1 positional delete file
+            rewrite_sql = f"""
+                CALL spark_catalog.system.rewrite_data_files(
+                    table => '{self.table_name}',
+                    options => map('delete-file-threshold', '1')
+                )
+            """
+
+            print(f"🔄 Executing rewrite procedure with delete_file_threshold=1...")
+            print(f"   SQL: {rewrite_sql.strip()}")
+            print(
+                f"   Rationale: Forces rewrite even with single positional delete file"
+            )
+
+            result = self.spark.sql(rewrite_sql)
+
+            # Collect results to see what was rewritten
+            rewrite_result = result.collect()[0]
+            print(f"📊 Rewrite result: {rewrite_result}")
+
+            # Check if we actually rewrote anything
+            if rewrite_result.rewritten_data_files_count > 0:
+                print(
+                    f"✅ Successfully rewrote {rewrite_result.rewritten_data_files_count} data files"
+                )
+                print(
+                    f"   - Added {rewrite_result.added_data_files_count} new data files"
+                )
+                print(f"   - Rewrote {rewrite_result.rewritten_bytes_count} bytes")
+                print(f"   - Positional deletes have been materialized!")
+            else:
+                print(f"⚠️ No files were rewritten (rewritten_data_files_count=0)")
+                print(f"   - This may indicate no positional deletes exist")
+                print(f"   - Or the table may already be in optimal state")
+
+            yield f"Rewrite completed for {self.table_name}"
+
+        except Exception as e:
+            print(f"❌ Error during rewrite: {e}")
+            import traceback
+
+            traceback.print_exc()
+            yield f"Rewrite failed for {self.table_name}: {e}"
+
+    def teardown(self):
+        """Clean up Spark session."""
+        if hasattr(self, "spark"):
+            print("✅ Spark session stopped")
+            self.spark.stop()
deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py RENAMED
@@ -9,10 +9,8 @@ import deltacat as dc
 
 from deltacat import logs
 from deltacat import IcebergCatalog
-from deltacat.catalog.iceberg import IcebergCatalogConfig
-from deltacat.examples.common.fixtures import (
-    store_cli_args_in_os_environ,
-)
+from deltacat.experimental.catalog.iceberg import IcebergCatalogConfig
+from env import store_cli_args_in_os_environ
 
 from pyiceberg.schema import (
     Schema,
@@ -23,7 +21,7 @@ from pyiceberg.schema import (
 from pyiceberg.partitioning import PartitionSpec, PartitionField
 from pyiceberg.transforms import BucketTransform
 
-from deltacat.storage.iceberg.model import (
+from deltacat.experimental.storage.iceberg.model import (
     SchemaMapper,
     PartitionSchemeMapper,
 )
deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py RENAMED
@@ -4,9 +4,7 @@ import deltacat as dc
 
 from deltacat import logs
 from deltacat import IcebergCatalog
-from deltacat.examples.common.fixtures import (
-    store_cli_args_in_os_environ,
-)
+from env import store_cli_args_in_os_environ
 
 from pyiceberg.schema import (
     Schema,
@@ -22,7 +20,7 @@ from pyiceberg.transforms import DayTransform, IdentityTransform
 from pyiceberg.table.sorting import SortField, SortOrder
 
 from deltacat.exceptions import TableAlreadyExistsError
-from deltacat.storage.iceberg.model import (
+from deltacat.experimental.storage.iceberg.model import (
     SchemaMapper,
     PartitionSchemeMapper,
     SortSchemeMapper,
deltacat/examples/indexer/indexer.py CHANGED
@@ -59,8 +59,8 @@ def run(
         "use_pyarrow": True,  # use the native pyarrow reader
     },
     # writer arguments to pass to the default writer (polars)
-    # for the given parquet-based datasink, it accepts the same
-    # arguments as polars.DataFrame.
+    # for the given parquet-based datasink, it generally accepts the same
+    # arguments as polars.DataFrame.write_{dest-type} except for `file`
     writer_args={
         "compression": "lz4",  # faster compression & decompression
         # "compression": "zstd",  # better compression ratio
deltacat/examples/indexer/job_runner.py CHANGED
@@ -64,8 +64,7 @@ def run_sync(
     cluster_cfg_file_path = working_dir.joinpath(cloud).joinpath("deltacat.yaml")
     client = job_client(cluster_cfg_file_path, restart_ray=restart_ray)
     job_number = 0
-    while
-        jobs_to_submit -= 1
+    while job_number < jobs_to_submit:
         job_dest = dest + f".{job_number}"
         job_run_result = client.run_job(
             # Entrypoint shell command to execute
deltacat/exceptions.py CHANGED
@@ -1,6 +1,6 @@
 from __future__ import annotations
 from enum import Enum
-from typing import Callable
+from typing import Callable, Optional, TYPE_CHECKING
 import logging
 
 import tenacity
@@ -28,6 +28,9 @@ from deltacat.utils.ray_utils.runtime import (
     get_current_ray_task_id,
 )
 
+if TYPE_CHECKING:
+    from deltacat.storage.model.schema import FieldLocator
+
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
 DELTACAT_STORAGE_PARAM = "deltacat_storage"
@@ -74,9 +77,18 @@ class DeltaCatErrorNames(str, Enum):
     TABLE_NOT_FOUND_ERROR = "TableNotFoundError"
     TABLE_VERSION_NOT_FOUND_ERROR = "TableVersionNotFoundError"
     STREAM_NOT_FOUND_ERROR = "StreamNotFoundError"
+    PARTITION_NOT_FOUND_ERROR = "PartitionNotFoundError"
     DELTA_NOT_FOUND_ERROR = "DeltaNotFoundError"
     TABLE_ALREADY_EXISTS_ERROR = "TableAlreadyExistsError"
+    TABLE_VERSION_ALREADY_EXISTS_ERROR = "TableVersionAlreadyExistsError"
     NAMESPACE_ALREADY_EXISTS_ERROR = "NamespaceAlreadyExistsError"
+    SCHEMA_COMPATIBILITY_ERROR = "SchemaCompatibilityError"
+    SCHEMA_VALIDATION_ERROR = "SchemaValidationError"
+    TABLE_VALIDATION_ERROR = "TableValidationError"
+    CONCURRENT_MODIFICATION_ERROR = "ConcurrentModificationError"
+    OBJECT_NOT_FOUND_ERROR = "ObjectNotFoundError"
+    OBJECT_DELETED_ERROR = "ObjectDeletedError"
+    OBJECT_ALREADY_EXISTS_ERROR = "ObjectAlreadyExistsError"
 
 
 class DeltaCatError(Exception):
@@ -87,9 +99,12 @@ class DeltaCatError(Exception):
         super().__init__(*args, **kwargs)
 
     def _get_ray_task_id_and_node_ip(self):
-        task_id = get_current_ray_task_id()
-        node_ip = ray.util.get_node_ip_address()
-        return task_id, node_ip
+        if ray.is_initialized():
+            task_id = get_current_ray_task_id()
+            node_ip = ray.util.get_node_ip_address()
+            return task_id, node_ip
+        else:
+            return None, None
 
 
 class NonRetryableError(DeltaCatError):
@@ -232,6 +247,10 @@ class TableVersionNotFoundError(NonRetryableError):
     error_name = DeltaCatErrorNames.TABLE_VERSION_NOT_FOUND_ERROR.value
 
 
+class PartitionNotFoundError(NonRetryableError):
+    error_name = DeltaCatErrorNames.PARTITION_NOT_FOUND_ERROR.value
+
+
 class StreamNotFoundError(NonRetryableError):
     error_name = DeltaCatErrorNames.STREAM_NOT_FOUND_ERROR.value
 
@@ -244,10 +263,53 @@ class TableAlreadyExistsError(NonRetryableError):
     error_name = DeltaCatErrorNames.TABLE_ALREADY_EXISTS_ERROR.value
 
 
+class TableVersionAlreadyExistsError(NonRetryableError):
+    error_name = DeltaCatErrorNames.TABLE_VERSION_ALREADY_EXISTS_ERROR.value
+
+
 class NamespaceAlreadyExistsError(NonRetryableError):
     error_name = DeltaCatErrorNames.TABLE_ALREADY_EXISTS_ERROR.value
 
 
+class ObjectNotFoundError(NonRetryableError):
+    error_name = DeltaCatErrorNames.OBJECT_NOT_FOUND_ERROR.value
+
+
+class ObjectDeletedError(NonRetryableError):
+    error_name = DeltaCatErrorNames.OBJECT_DELETED_ERROR.value
+
+
+class ObjectAlreadyExistsError(NonRetryableError):
+    error_name = DeltaCatErrorNames.OBJECT_ALREADY_EXISTS_ERROR.value
+
+
+class ConcurrentModificationError(NonRetryableError):
+    error_name = DeltaCatErrorNames.CONCURRENT_MODIFICATION_ERROR.value
+
+
+class SchemaValidationError(NonRetryableError):
+    error_name = DeltaCatErrorNames.SCHEMA_VALIDATION_ERROR.value
+
+
+class TableValidationError(NonRetryableError):
+    error_name = DeltaCatErrorNames.TABLE_VALIDATION_ERROR.value
+
+
+class SchemaCompatibilityError(NonRetryableError):
+    error_name = DeltaCatErrorNames.SCHEMA_COMPATIBILITY_ERROR.value
+    """Raised when a schema update would break backward compatibility."""
+
+    def __init__(
+        self,
+        message: str,
+        field_locator: Optional[FieldLocator] = None,
+        *args,
+        **kwargs,
+    ):
+        super().__init__(message, *args, **kwargs)
+        self.field_locator = field_locator
+
+
 def categorize_errors(func: Callable):
     def wrapper(*args, **kwargs):
         try:
deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py RENAMED
@@ -15,7 +15,7 @@ class IcebergCatalogConfig:
 
     This configuration is passed through to PyIceberg by invoking load_catalog.
     The Properties provided must match properties accepted by PyIceberg for each catalog type
-    See: :func:`deltacat.catalog.iceberg.initialize`
+    See: :func:`deltacat.experimental.catalog.iceberg.initialize`
 
     Attributes:
         type: The PyIceberg Catalog instance
deltacat/{catalog → experimental/catalog}/iceberg/impl.py RENAMED
@@ -1,4 +1,5 @@
 import logging
+import sys
 
 from typing import Any, Dict, List, Optional, Union
 
@@ -7,13 +8,19 @@ from daft.daft import ScanOperatorHandle, StorageConfig
 from daft.logical.builder import LogicalPlanBuilder
 
 from deltacat import logs
+from deltacat.catalog.model.catalog import Catalog
 from deltacat.catalog.model.table_definition import TableDefinition
-from deltacat.daft.daft_scan import DeltaCatScanOperator
+from deltacat.utils.daft import DeltaCatScanOperator
 from deltacat.exceptions import TableAlreadyExistsError
-from deltacat.storage.iceberg.iceberg_scan_planner import IcebergScanPlanner
-from deltacat.storage.iceberg.model import SchemaMapper, PartitionSchemeMapper
+from deltacat.experimental.storage.iceberg.iceberg_scan_planner import (
+    IcebergScanPlanner,
+)
+from deltacat.experimental.storage.iceberg.model import (
+    PartitionSchemeMapper,
+    SchemaMapper,
+)
 from deltacat.storage.model.partition import PartitionScheme
-from deltacat.storage.iceberg.impl import _get_native_catalog
+from deltacat.experimental.storage.iceberg.impl import _get_native_catalog
 from deltacat.storage.model.sort_key import SortScheme
 from deltacat.storage.model.list_result import ListResult
 from deltacat.storage.model.namespace import Namespace, NamespaceProperties
@@ -26,20 +33,31 @@ from deltacat.storage.model.types import (
     LocalTable,
     StreamFormat,
 )
-from deltacat.storage.iceberg import impl as IcebergStorage
+from deltacat.experimental.storage.iceberg import impl as IcebergStorage
 from deltacat.types.media import ContentType
 from deltacat.types.tables import TableWriteMode
 from deltacat.constants import DEFAULT_NAMESPACE
-from deltacat.catalog.iceberg.iceberg_catalog_config import IcebergCatalogConfig
+from deltacat.experimental.catalog.iceberg.iceberg_catalog_config import (
+    IcebergCatalogConfig,
+)
 
-from pyiceberg.catalog import Catalog, load_catalog
+from pyiceberg.catalog import Catalog as PyIcebergCatalog, load_catalog
 from pyiceberg.transforms import BucketTransform
 
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
+IcebergCatalog = sys.modules[__name__]
+
+
+def from_config(config: IcebergCatalogConfig, *args, **kwargs) -> Catalog:
+    """
+    Factory method to construct a catalog from Iceberg catalog configuration.
+    """
+    return Catalog(config, impl=IcebergCatalog, *args, **kwargs)
+
 
 # catalog functions
-def initialize(
+def initialize(config: IcebergCatalogConfig, **kwargs) -> PyIcebergCatalog:
     """
     Initializes an Iceberg catalog with the given config.
 
@@ -123,7 +141,7 @@ def write_to_table(
     )
     # TODO(pdames): only append s3:// to output file paths when writing to S3!
     out_file_paths = [f"s3://{val}" for val in out_df.to_arrow()[0]]
-    from deltacat.catalog.iceberg import overrides
+    from deltacat.experimental.catalog.iceberg import overrides
 
     overrides.append(
         table_definition.table.native_object,
@@ -180,7 +198,7 @@ def create_table(
     name: str,
     *args,
     namespace: Optional[str] = None,
-
+    table_version: Optional[str] = None,
     lifecycle_state: Optional[LifecycleState] = None,
     schema: Optional[Schema] = None,
     partition_scheme: Optional[PartitionScheme] = None,
@@ -224,7 +242,7 @@ def create_table(
     IcebergStorage.create_table_version(
         namespace=namespace,
         table_name=name,
-        table_version=
+        table_version=table_version,
         schema=schema,
         partition_scheme=partition_scheme,
         sort_keys=sort_keys,