deltacat 2.0__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +117 -18
- deltacat/api.py +536 -126
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/benchmark_engine.py +4 -2
- deltacat/benchmarking/conftest.py +1 -19
- deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
- deltacat/catalog/__init__.py +64 -5
- deltacat/catalog/delegate.py +445 -63
- deltacat/catalog/interface.py +188 -62
- deltacat/catalog/main/impl.py +2444 -282
- deltacat/catalog/model/catalog.py +208 -113
- deltacat/catalog/model/properties.py +63 -24
- deltacat/compute/__init__.py +14 -0
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
- deltacat/compute/compactor/model/round_completion_info.py +16 -6
- deltacat/compute/compactor/repartition_session.py +8 -21
- deltacat/compute/compactor/steps/hash_bucket.py +5 -5
- deltacat/compute/compactor/steps/materialize.py +9 -7
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +6 -5
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +17 -14
- deltacat/compute/compactor_v2/constants.py +30 -1
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +33 -8
- deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +267 -55
- deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +11 -4
- deltacat/compute/compactor_v2/utils/merge.py +15 -2
- deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
- deltacat/compute/compactor_v2/utils/task_options.py +45 -33
- deltacat/compute/converter/constants.py +5 -0
- deltacat/compute/converter/converter_session.py +207 -52
- deltacat/compute/converter/model/convert_input.py +43 -16
- deltacat/compute/converter/model/convert_input_files.py +33 -16
- deltacat/compute/converter/model/convert_result.py +80 -0
- deltacat/compute/converter/model/converter_session_params.py +64 -19
- deltacat/compute/converter/pyiceberg/catalog.py +21 -18
- deltacat/compute/converter/pyiceberg/overrides.py +193 -65
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
- deltacat/compute/converter/steps/convert.py +230 -75
- deltacat/compute/converter/steps/dedupe.py +46 -12
- deltacat/compute/converter/utils/convert_task_options.py +66 -22
- deltacat/compute/converter/utils/converter_session_utils.py +126 -60
- deltacat/compute/converter/utils/iceberg_columns.py +13 -8
- deltacat/compute/converter/utils/io.py +173 -13
- deltacat/compute/converter/utils/s3u.py +42 -27
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/client.py +417 -0
- deltacat/compute/resource_estimation/delta.py +38 -6
- deltacat/compute/resource_estimation/model.py +8 -0
- deltacat/constants.py +49 -6
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/env.py +10 -0
- deltacat/examples/basic_logging.py +6 -6
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +66 -21
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
- deltacat/examples/hello_world.py +4 -2
- deltacat/examples/indexer/indexer.py +163 -0
- deltacat/examples/indexer/job_runner.py +198 -0
- deltacat/exceptions.py +66 -4
- deltacat/experimental/catalog/iceberg/__init__.py +6 -0
- deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
- deltacat/{catalog → experimental/catalog}/iceberg/impl.py +43 -12
- deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +12 -14
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/daft/__init__.py +4 -0
- deltacat/experimental/daft/daft_catalog.py +229 -0
- deltacat/experimental/storage/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
- deltacat/{storage → experimental/storage}/iceberg/impl.py +6 -4
- deltacat/{storage → experimental/storage}/iceberg/model.py +7 -3
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/__init__.py +11 -0
- deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
- deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -12
- deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
- deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
- deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
- deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
- deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -3
- deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
- deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
- deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
- deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +7 -6
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
- deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
- deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
- deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
- deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
- deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
- deltacat/io/__init__.py +13 -0
- deltacat/io/dataset/__init__.py +0 -0
- deltacat/io/dataset/deltacat_dataset.py +91 -0
- deltacat/io/datasink/__init__.py +0 -0
- deltacat/io/datasink/deltacat_datasink.py +207 -0
- deltacat/io/datasource/__init__.py +0 -0
- deltacat/io/datasource/deltacat_datasource.py +579 -0
- deltacat/io/reader/__init__.py +0 -0
- deltacat/io/reader/deltacat_read_api.py +172 -0
- deltacat/storage/__init__.py +22 -2
- deltacat/storage/interface.py +54 -32
- deltacat/storage/main/impl.py +1494 -541
- deltacat/storage/model/delta.py +27 -3
- deltacat/storage/model/expression/__init__.py +47 -0
- deltacat/storage/model/expression/expression.py +656 -0
- deltacat/storage/model/expression/visitor.py +248 -0
- deltacat/storage/model/locator.py +6 -12
- deltacat/storage/model/manifest.py +231 -6
- deltacat/storage/model/metafile.py +224 -119
- deltacat/storage/model/namespace.py +8 -1
- deltacat/storage/model/partition.py +117 -42
- deltacat/storage/model/scan/push_down.py +32 -5
- deltacat/storage/model/schema.py +2427 -159
- deltacat/storage/model/shard.py +6 -2
- deltacat/storage/model/sort_key.py +40 -0
- deltacat/storage/model/stream.py +9 -2
- deltacat/storage/model/table.py +12 -1
- deltacat/storage/model/table_version.py +11 -0
- deltacat/storage/model/transaction.py +1184 -208
- deltacat/storage/model/transform.py +81 -2
- deltacat/storage/model/types.py +53 -29
- deltacat/storage/util/__init__.py +0 -0
- deltacat/storage/util/scan_planner.py +26 -0
- deltacat/tests/_io/reader/__init__.py +0 -0
- deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
- deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/data/__init__.py +0 -0
- deltacat/tests/catalog/main/__init__.py +0 -0
- deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
- deltacat/tests/catalog/model/__init__.py +0 -0
- deltacat/tests/catalog/model/test_table_definition.py +16 -0
- deltacat/tests/catalog/test_catalogs.py +103 -106
- deltacat/tests/catalog/test_default_catalog_impl.py +12152 -72
- deltacat/tests/compute/compact_partition_test_cases.py +35 -8
- deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
- deltacat/tests/compute/compactor/utils/test_io.py +124 -120
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
- deltacat/tests/compute/conftest.py +8 -44
- deltacat/tests/compute/converter/test_convert_session.py +697 -349
- deltacat/tests/compute/converter/utils.py +15 -6
- deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
- deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
- deltacat/tests/compute/test_compact_partition_params.py +13 -8
- deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +716 -43
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
- deltacat/tests/daft/__init__.py +0 -0
- deltacat/tests/daft/test_model.py +97 -0
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/catalog/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/experimental/daft/__init__.py +0 -0
- deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
- deltacat/tests/experimental/storage/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
- deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -2
- deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
- deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
- deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +6 -4
- deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
- deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
- deltacat/tests/storage/main/test_main_storage.py +6900 -95
- deltacat/tests/storage/model/test_expression.py +327 -0
- deltacat/tests/storage/model/test_manifest.py +129 -0
- deltacat/tests/storage/model/test_metafile_io.py +78 -173
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +171 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_shard.py +3 -1
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_transaction.py +393 -48
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +1036 -11
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/pyarrow.py +52 -21
- deltacat/tests/test_utils/storage.py +23 -34
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +121 -31
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1370 -89
- deltacat/types/media.py +345 -37
- deltacat/types/tables.py +2344 -46
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +824 -40
- deltacat/utils/export.py +3 -1
- deltacat/utils/filesystem.py +139 -9
- deltacat/utils/metafile_locator.py +2 -1
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +759 -0
- deltacat/utils/pyarrow.py +1373 -192
- deltacat/utils/ray_utils/concurrency.py +1 -1
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/ray_utils/runtime.py +56 -4
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +1325 -0
- deltacat-2.0.0.dist-info/METADATA +1163 -0
- deltacat-2.0.0.dist-info/RECORD +439 -0
- {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
- deltacat/catalog/iceberg/__init__.py +0 -4
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/examples/common/fixtures.py +0 -15
- deltacat/storage/iceberg/iceberg_scan_planner.py +0 -28
- deltacat/storage/rivulet/__init__.py +0 -11
- deltacat/storage/rivulet/feather/__init__.py +0 -5
- deltacat/storage/rivulet/parquet/__init__.py +0 -5
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1235
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-2.0.dist-info/METADATA +0 -65
- deltacat-2.0.dist-info/RECORD +0 -347
- /deltacat/compute/{merge_on_read/model → jobs}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/utils → docs}/__init__.py +0 -0
- /deltacat/{examples/common → docs/autogen}/__init__.py +0 -0
- /deltacat/{examples/iceberg → docs/autogen/schema}/__init__.py +0 -0
- /deltacat/{storage/iceberg → docs/autogen/schema/inference}/__init__.py +0 -0
- /deltacat/{storage/rivulet/arrow → examples/compactor}/__init__.py +0 -0
- /deltacat/{storage/rivulet/fs → examples/experimental}/__init__.py +0 -0
- /deltacat/{storage/rivulet/metastore → examples/experimental/iceberg}/__init__.py +0 -0
- /deltacat/{storage/rivulet/reader → examples/experimental/iceberg/converter}/__init__.py +0 -0
- /deltacat/{storage/rivulet/schema → examples/experimental/iceberg/converter/beam}/__init__.py +0 -0
- /deltacat/{storage/rivulet/writer → examples/indexer}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet → examples/indexer/aws}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/fs → examples/indexer/gcp}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/schema → experimental}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/writer → experimental/catalog}/__init__.py +0 -0
- /deltacat/{storage/rivulet/parquet/data_reader.py → experimental/compatibility/__init__.py} +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
- {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
- {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,198 @@
|
|
1
|
+
import argparse
|
2
|
+
import pathlib
|
3
|
+
|
4
|
+
from deltacat.compute import (
|
5
|
+
job_client,
|
6
|
+
JobStatus,
|
7
|
+
)
|
8
|
+
|
9
|
+
|
10
|
+
def run_async(
    source: str,
    dest: str,
    jobs_to_submit: int,
    job_timeout: int,
    cloud: str,
    restart_ray: bool,
):
    """Submit every indexer job first, then wait for each one in submission order.

    Args:
        source: Source URL passed to ``indexer.py`` via ``--source``.
        dest: Destination URL prefix; job number ``i`` writes to ``f"{dest}.{i}"``.
        jobs_to_submit: Number of jobs to submit. Values <= 0 submit nothing.
        job_timeout: Timeout in seconds passed to ``client.await_job`` per job.
        cloud: Name of the sub-directory (next to this file) that holds the
            ``deltacat.yaml`` cluster config.
        restart_ray: Forwarded to ``job_client`` as ``restart_ray``.

    Raises:
        RuntimeError: If any awaited job finishes with a status other than
            ``JobStatus.SUCCEEDED``. That job's logs are printed first.
    """
    # Local import so this function carries its own dependency.
    import shlex

    working_dir = pathlib.Path(__file__).parent
    cluster_cfg_file_path = working_dir.joinpath(cloud).joinpath("deltacat.yaml")
    client = job_client(cluster_cfg_file_path, restart_ray=restart_ray)

    job_ids = []
    for job_number in range(jobs_to_submit):
        job_dest = f"{dest}.{job_number}"
        job_id = client.submit_job(
            # Entrypoint shell command to execute. The URLs are shell-quoted so
            # that quotes, spaces, or metacharacters in them cannot break out
            # of (or inject into) the command line.
            entrypoint=(
                f"python3 indexer.py --source {shlex.quote(source)} "
                f"--dest {shlex.quote(job_dest)}"
            ),
            # Path to the local directory that contains the indexer.py file.
            # This entire directory will be zipped into a job package, so keep
            # it small.
            runtime_env={"working_dir": working_dir},
        )
        job_ids.append(job_id)

    print("Waiting for all jobs to complete...")
    job_log_sections = []
    for job_number, job_id in enumerate(job_ids):
        job_status = client.await_job(job_id, timeout_seconds=job_timeout)
        if job_status != JobStatus.SUCCEEDED:
            # Surface the failing job's logs before aborting the whole run.
            print(f"Job `{job_id}` logs: ")
            print(client.get_job_logs(job_id))
            raise RuntimeError(f"Job `{job_id}` terminated with status: {job_status}")
        job_log_sections.append(f"\nJob #{job_number} logs: \n")
        job_log_sections.append(client.get_job_logs(job_id))
    print("All jobs completed!")
    print("Job Logs: ")
    print("".join(job_log_sections))
|
53
|
+
|
54
|
+
|
55
|
+
def run_sync(
    source: str,
    dest: str,
    jobs_to_submit: int,
    job_timeout: int,
    cloud: str,
    restart_ray: bool,
):
    """Run the indexer jobs one ``client.run_job`` call at a time.

    Each ``run_job`` call is issued only after the previous one has returned,
    and its terminal state and logs are printed immediately.

    Args:
        source: Source URL passed to ``indexer.py`` via ``--source``.
        dest: Destination URL prefix; job number ``i`` writes to ``f"{dest}.{i}"``.
        jobs_to_submit: Number of jobs to run. Values <= 0 run nothing.
        job_timeout: Timeout in seconds passed to ``client.run_job`` per job.
        cloud: Name of the sub-directory (next to this file) that holds the
            ``deltacat.yaml`` cluster config.
        restart_ray: Forwarded to ``job_client`` as ``restart_ray``.
    """
    # Local import so this function carries its own dependency.
    import shlex

    working_dir = pathlib.Path(__file__).parent
    cluster_cfg_file_path = working_dir.joinpath(cloud).joinpath("deltacat.yaml")
    client = job_client(cluster_cfg_file_path, restart_ray=restart_ray)

    for job_number in range(jobs_to_submit):
        job_dest = f"{dest}.{job_number}"
        job_run_result = client.run_job(
            # Entrypoint shell command to execute. The URLs are shell-quoted so
            # that quotes, spaces, or metacharacters in them cannot break out
            # of (or inject into) the command line.
            entrypoint=(
                f"python3 indexer.py --source {shlex.quote(source)} "
                f"--dest {shlex.quote(job_dest)}"
            ),
            # Path to the local directory that contains the indexer.py file.
            # This entire directory will be zipped into a job package, so keep
            # it small.
            runtime_env={"working_dir": working_dir},
            timeout_seconds=job_timeout,
        )
        print(
            f"Job ID {job_run_result.job_id} terminal state: {job_run_result.job_status}"
        )
        print(f"Job ID {job_run_result.job_id} logs: ")
        print(job_run_result.job_logs)
|
84
|
+
|
85
|
+
|
86
|
+
def run(
    source: str,
    dest: str,
    restart_ray: bool,
    jobs_to_submit: int,
    job_timeout: int,
    asynchronous: bool,
    cloud_provider: str,
):
    """Forward the parsed options to ``run_async`` or ``run_sync``.

    ``asynchronous`` selects the runner; ``cloud_provider`` is handed to the
    chosen runner as its ``cloud`` argument. All other arguments are passed
    through under the same names.
    """
    runner_kwargs = {
        "source": source,
        "dest": dest,
        "jobs_to_submit": jobs_to_submit,
        "job_timeout": job_timeout,
        "cloud": cloud_provider,
        "restart_ray": restart_ray,
    }
    if asynchronous:
        run_async(**runner_kwargs)
    else:
        run_sync(**runner_kwargs)
|
104
|
+
|
105
|
+
|
106
|
+
if __name__ == "__main__":
    # Run this example through a command of the form:
    #
    #   $ python ./deltacat/examples/indexer/job_runner.py \
    #       --source text+s3://openalex-mag-format/data_dump_v1/2022-07-08/nlp/PaperAbstractsInvertedIndex.txt_part31 \
    #       --dest parquet+s3://deltacat-example-output/openalex/PaperAbstractsInvertedIndex.part31.parquet \
    #       --asynchronous \
    #       --jobs-to-submit 100 \
    #       --job-timeout 90 \
    #       --cloud-provider aws
    #
    # Do not put a bare "--" before the flags: argparse treats everything
    # after "--" as positional and would reject the options.
    script_args = [
        (
            [
                "--source",
            ],
            {
                "help": "Source DeltaCAT URL to index.",
                "type": str,
                "default": "text+s3://openalex-mag-format/data_dump_v1/2022-07-08/nlp/PaperAbstractsInvertedIndex.txt_part31",
            },
        ),
        (
            [
                "--dest",
            ],
            {
                "help": "Destination DeltaCAT URL to store the indexed file.",
                "type": str,
                "default": "parquet+s3://deltacat-example-output/openalex/PaperAbstractsInvertedIndex.part31.parquet",
            },
        ),
        (
            [
                "--restart-ray",
            ],
            {
                "help": "Restart Ray on an existing cluster.",
                "action": "store_true",
                "default": False,
            },
        ),
        (
            [
                "--asynchronous",
            ],
            {
                "help": "Run jobs asynchronously.",
                "action": "store_true",
                "default": False,
            },
        ),
        (
            [
                "--jobs-to-submit",
            ],
            {
                "help": "Number of indexer jobs to submit for execution.",
                "type": int,
                "default": 1,
            },
        ),
        (
            [
                "--job-timeout",
            ],
            {
                "help": "Job timeout in seconds.",
                "type": int,
                "default": 300,
            },
        ),
        (
            [
                "--cloud-provider",
            ],
            {
                "help": "Ray Cluster Cloud Provider ('aws' or 'gcp')",
                "type": str,
                "default": "aws",
            },
        ),
    ]

    # parse CLI input arguments
    parser = argparse.ArgumentParser()
    for option_flags, option_kwargs in script_args:
        parser.add_argument(*option_flags, **option_kwargs)
    cli_args = parser.parse_args()
    print(f"Command Line Arguments: {cli_args}")

    # Run the example with the parsed CLI arguments as keyword arguments.
    # argparse stores "--some-flag" under the attribute "some_flag", which
    # lines up with the parameter names of run().
    run(**vars(cli_args))
|
deltacat/exceptions.py
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
from enum import Enum
|
3
|
-
from typing import Callable
|
3
|
+
from typing import Callable, Optional, TYPE_CHECKING
|
4
4
|
import logging
|
5
5
|
|
6
6
|
import tenacity
|
@@ -28,6 +28,9 @@ from deltacat.utils.ray_utils.runtime import (
|
|
28
28
|
get_current_ray_task_id,
|
29
29
|
)
|
30
30
|
|
31
|
+
if TYPE_CHECKING:
|
32
|
+
from deltacat.storage.model.schema import FieldLocator
|
33
|
+
|
31
34
|
logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
|
32
35
|
|
33
36
|
DELTACAT_STORAGE_PARAM = "deltacat_storage"
|
@@ -74,9 +77,18 @@ class DeltaCatErrorNames(str, Enum):
|
|
74
77
|
TABLE_NOT_FOUND_ERROR = "TableNotFoundError"
|
75
78
|
TABLE_VERSION_NOT_FOUND_ERROR = "TableVersionNotFoundError"
|
76
79
|
STREAM_NOT_FOUND_ERROR = "StreamNotFoundError"
|
80
|
+
PARTITION_NOT_FOUND_ERROR = "PartitionNotFoundError"
|
77
81
|
DELTA_NOT_FOUND_ERROR = "DeltaNotFoundError"
|
78
82
|
TABLE_ALREADY_EXISTS_ERROR = "TableAlreadyExistsError"
|
83
|
+
TABLE_VERSION_ALREADY_EXISTS_ERROR = "TableVersionAlreadyExistsError"
|
79
84
|
NAMESPACE_ALREADY_EXISTS_ERROR = "NamespaceAlreadyExistsError"
|
85
|
+
SCHEMA_COMPATIBILITY_ERROR = "SchemaCompatibilityError"
|
86
|
+
SCHEMA_VALIDATION_ERROR = "SchemaValidationError"
|
87
|
+
TABLE_VALIDATION_ERROR = "TableValidationError"
|
88
|
+
CONCURRENT_MODIFICATION_ERROR = "ConcurrentModificationError"
|
89
|
+
OBJECT_NOT_FOUND_ERROR = "ObjectNotFoundError"
|
90
|
+
OBJECT_DELETED_ERROR = "ObjectDeletedError"
|
91
|
+
OBJECT_ALREADY_EXISTS_ERROR = "ObjectAlreadyExistsError"
|
80
92
|
|
81
93
|
|
82
94
|
class DeltaCatError(Exception):
|
@@ -87,9 +99,12 @@ class DeltaCatError(Exception):
|
|
87
99
|
super().__init__(*args, **kwargs)
|
88
100
|
|
89
101
|
def _get_ray_task_id_and_node_ip(self):
    """Return ``(task_id, node_ip)`` for the current Ray context.

    Returns ``(None, None)`` when ``ray.is_initialized()`` is false.
    """
    if not ray.is_initialized():
        return None, None
    # Evaluated left to right: task id first, then the node IP address.
    return get_current_ray_task_id(), ray.util.get_node_ip_address()
|
93
108
|
|
94
109
|
|
95
110
|
class NonRetryableError(DeltaCatError):
|
@@ -232,6 +247,10 @@ class TableVersionNotFoundError(NonRetryableError):
|
|
232
247
|
error_name = DeltaCatErrorNames.TABLE_VERSION_NOT_FOUND_ERROR.value
|
233
248
|
|
234
249
|
|
250
|
+
class PartitionNotFoundError(NonRetryableError):
    """Non-retryable DeltaCAT error categorized as ``PartitionNotFoundError``."""

    error_name = DeltaCatErrorNames.PARTITION_NOT_FOUND_ERROR.value
|
252
|
+
|
253
|
+
|
235
254
|
class StreamNotFoundError(NonRetryableError):
    """Non-retryable DeltaCAT error categorized as ``StreamNotFoundError``."""

    error_name = DeltaCatErrorNames.STREAM_NOT_FOUND_ERROR.value
|
237
256
|
|
@@ -244,10 +263,53 @@ class TableAlreadyExistsError(NonRetryableError):
|
|
244
263
|
error_name = DeltaCatErrorNames.TABLE_ALREADY_EXISTS_ERROR.value
|
245
264
|
|
246
265
|
|
266
|
+
class TableVersionAlreadyExistsError(NonRetryableError):
    """Non-retryable DeltaCAT error categorized as ``TableVersionAlreadyExistsError``."""

    error_name = DeltaCatErrorNames.TABLE_VERSION_ALREADY_EXISTS_ERROR.value
|
268
|
+
|
269
|
+
|
247
270
|
class NamespaceAlreadyExistsError(NonRetryableError):
    """Non-retryable DeltaCAT error categorized as ``NamespaceAlreadyExistsError``."""

    # Use the namespace-specific error name. This class previously reused
    # TABLE_ALREADY_EXISTS_ERROR, so namespace conflicts were reported under
    # the table error name.
    error_name = DeltaCatErrorNames.NAMESPACE_ALREADY_EXISTS_ERROR.value
|
249
272
|
|
250
273
|
|
274
|
+
class ObjectNotFoundError(NonRetryableError):
    """Non-retryable DeltaCAT error categorized as ``ObjectNotFoundError``."""

    error_name = DeltaCatErrorNames.OBJECT_NOT_FOUND_ERROR.value
|
276
|
+
|
277
|
+
|
278
|
+
class ObjectDeletedError(NonRetryableError):
    """Non-retryable DeltaCAT error categorized as ``ObjectDeletedError``."""

    error_name = DeltaCatErrorNames.OBJECT_DELETED_ERROR.value
|
280
|
+
|
281
|
+
|
282
|
+
class ObjectAlreadyExistsError(NonRetryableError):
    """Non-retryable DeltaCAT error categorized as ``ObjectAlreadyExistsError``."""

    error_name = DeltaCatErrorNames.OBJECT_ALREADY_EXISTS_ERROR.value
|
284
|
+
|
285
|
+
|
286
|
+
class ConcurrentModificationError(NonRetryableError):
    """Non-retryable DeltaCAT error categorized as ``ConcurrentModificationError``."""

    error_name = DeltaCatErrorNames.CONCURRENT_MODIFICATION_ERROR.value
|
288
|
+
|
289
|
+
|
290
|
+
class SchemaValidationError(NonRetryableError):
    """Non-retryable DeltaCAT error categorized as ``SchemaValidationError``."""

    error_name = DeltaCatErrorNames.SCHEMA_VALIDATION_ERROR.value
|
292
|
+
|
293
|
+
|
294
|
+
class TableValidationError(NonRetryableError):
    """Non-retryable DeltaCAT error categorized as ``TableValidationError``."""

    error_name = DeltaCatErrorNames.TABLE_VALIDATION_ERROR.value
|
296
|
+
|
297
|
+
|
298
|
+
class SchemaCompatibilityError(NonRetryableError):
    """Raised when a schema update would break backward compatibility.

    Attributes:
        field_locator: Optional locator of the schema field associated with
            the error, or ``None`` when no field was given.
    """

    # The docstring must be the first statement in the class body; placed
    # after this assignment it would be a no-op string and leave __doc__ unset.
    error_name = DeltaCatErrorNames.SCHEMA_COMPATIBILITY_ERROR.value

    def __init__(
        self,
        message: str,
        field_locator: Optional[FieldLocator] = None,
        *args,
        **kwargs,
    ):
        """Create the error.

        Args:
            message: Error message, passed to the base class as its first
                positional argument.
            field_locator: Optional field locator stored on the instance.
            *args: Extra positional arguments forwarded to the base class.
            **kwargs: Extra keyword arguments forwarded to the base class.
        """
        super().__init__(message, *args, **kwargs)
        self.field_locator = field_locator
|
311
|
+
|
312
|
+
|
251
313
|
def categorize_errors(func: Callable):
|
252
314
|
def wrapper(*args, **kwargs):
|
253
315
|
try:
|
@@ -15,7 +15,7 @@ class IcebergCatalogConfig:
|
|
15
15
|
|
16
16
|
This configuration is passed through to PyIceberg by invoking load_catalog.
|
17
17
|
The Properties provided must match properties accepted by PyIceberg for each catalog type
|
18
|
-
See: :func:`deltacat.catalog.iceberg.initialize`
|
18
|
+
See: :func:`deltacat.experimental.catalog.iceberg.initialize`
|
19
19
|
|
20
20
|
Attributes:
|
21
21
|
type: The PyIceberg Catalog instance
|
@@ -1,16 +1,26 @@
|
|
1
1
|
import logging
|
2
|
+
import sys
|
2
3
|
|
3
4
|
from typing import Any, Dict, List, Optional, Union
|
4
5
|
|
5
|
-
from daft import DataFrame
|
6
|
+
from daft import DataFrame, context
|
7
|
+
from daft.daft import ScanOperatorHandle, StorageConfig
|
8
|
+
from daft.logical.builder import LogicalPlanBuilder
|
6
9
|
|
7
10
|
from deltacat import logs
|
11
|
+
from deltacat.catalog.model.catalog import Catalog
|
8
12
|
from deltacat.catalog.model.table_definition import TableDefinition
|
13
|
+
from deltacat.utils.daft import DeltaCatScanOperator
|
9
14
|
from deltacat.exceptions import TableAlreadyExistsError
|
10
|
-
from deltacat.storage.iceberg.iceberg_scan_planner import
|
11
|
-
|
15
|
+
from deltacat.experimental.storage.iceberg.iceberg_scan_planner import (
|
16
|
+
IcebergScanPlanner,
|
17
|
+
)
|
18
|
+
from deltacat.experimental.storage.iceberg.model import (
|
19
|
+
PartitionSchemeMapper,
|
20
|
+
SchemaMapper,
|
21
|
+
)
|
12
22
|
from deltacat.storage.model.partition import PartitionScheme
|
13
|
-
from deltacat.storage.iceberg.impl import _get_native_catalog
|
23
|
+
from deltacat.experimental.storage.iceberg.impl import _get_native_catalog
|
14
24
|
from deltacat.storage.model.sort_key import SortScheme
|
15
25
|
from deltacat.storage.model.list_result import ListResult
|
16
26
|
from deltacat.storage.model.namespace import Namespace, NamespaceProperties
|
@@ -23,20 +33,31 @@ from deltacat.storage.model.types import (
|
|
23
33
|
LocalTable,
|
24
34
|
StreamFormat,
|
25
35
|
)
|
26
|
-
from deltacat.storage.iceberg import impl as IcebergStorage
|
36
|
+
from deltacat.experimental.storage.iceberg import impl as IcebergStorage
|
27
37
|
from deltacat.types.media import ContentType
|
28
38
|
from deltacat.types.tables import TableWriteMode
|
29
39
|
from deltacat.constants import DEFAULT_NAMESPACE
|
30
|
-
from deltacat.catalog.iceberg.iceberg_catalog_config import
|
40
|
+
from deltacat.experimental.catalog.iceberg.iceberg_catalog_config import (
|
41
|
+
IcebergCatalogConfig,
|
42
|
+
)
|
31
43
|
|
32
|
-
from pyiceberg.catalog import Catalog, load_catalog
|
44
|
+
from pyiceberg.catalog import Catalog as PyIcebergCatalog, load_catalog
|
33
45
|
from pyiceberg.transforms import BucketTransform
|
34
46
|
|
35
47
|
logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
|
36
48
|
|
49
|
+
IcebergCatalog = sys.modules[__name__]
|
50
|
+
|
51
|
+
|
52
|
+
def from_config(config: IcebergCatalogConfig, *args, **kwargs) -> Catalog:
    """
    Build a Catalog from an Iceberg catalog configuration.

    The returned Catalog wraps `config` and uses this module (bound to the
    name `IcebergCatalog`) as its implementation. Any extra positional and
    keyword arguments are forwarded to the Catalog constructor.
    """
    # Positional unpacking is written before the keyword argument for
    # readability; the arguments received by Catalog are the same.
    catalog = Catalog(config, *args, impl=IcebergCatalog, **kwargs)
    return catalog
|
57
|
+
|
37
58
|
|
38
59
|
# catalog functions
|
39
|
-
def initialize(
|
60
|
+
def initialize(config: IcebergCatalogConfig, **kwargs) -> PyIcebergCatalog:
|
40
61
|
"""
|
41
62
|
Initializes an Iceberg catalog with the given config.
|
42
63
|
|
@@ -120,7 +141,7 @@ def write_to_table(
|
|
120
141
|
)
|
121
142
|
# TODO(pdames): only prepend s3:// to output file paths when writing to S3!
|
122
143
|
out_file_paths = [f"s3://{val}" for val in out_df.to_arrow()[0]]
|
123
|
-
from deltacat.catalog.iceberg import overrides
|
144
|
+
from deltacat.experimental.catalog.iceberg import overrides
|
124
145
|
|
125
146
|
overrides.append(
|
126
147
|
table_definition.table.native_object,
|
@@ -144,7 +165,17 @@ def read_table(
|
|
144
165
|
table: str, *args, namespace: Optional[str] = None, **kwargs
|
145
166
|
) -> DistributedDataset:
|
146
167
|
"""Read a table into a distributed dataset."""
|
147
|
-
|
168
|
+
# TODO: more proper IO configuration
|
169
|
+
io_config = context.get_context().daft_planning_config.default_io_config
|
170
|
+
multithreaded_io = context.get_context().get_or_create_runner().name != "ray"
|
171
|
+
|
172
|
+
storage_config = StorageConfig(multithreaded_io, io_config)
|
173
|
+
|
174
|
+
dc_table = get_table(name=table, namespace=namespace, **kwargs)
|
175
|
+
dc_scan_operator = DeltaCatScanOperator(dc_table, storage_config)
|
176
|
+
handle = ScanOperatorHandle.from_python_scan_operator(dc_scan_operator)
|
177
|
+
builder = LogicalPlanBuilder.from_tabular_scan(scan_operator=handle)
|
178
|
+
return DataFrame(builder)
|
148
179
|
|
149
180
|
|
150
181
|
def alter_table(
|
@@ -167,7 +198,7 @@ def create_table(
|
|
167
198
|
name: str,
|
168
199
|
*args,
|
169
200
|
namespace: Optional[str] = None,
|
170
|
-
|
201
|
+
table_version: Optional[str] = None,
|
171
202
|
lifecycle_state: Optional[LifecycleState] = None,
|
172
203
|
schema: Optional[Schema] = None,
|
173
204
|
partition_scheme: Optional[PartitionScheme] = None,
|
@@ -211,7 +242,7 @@ def create_table(
|
|
211
242
|
IcebergStorage.create_table_version(
|
212
243
|
namespace=namespace,
|
213
244
|
table_name=name,
|
214
|
-
table_version=
|
245
|
+
table_version=table_version,
|
215
246
|
schema=schema,
|
216
247
|
partition_scheme=partition_scheme,
|
217
248
|
sort_keys=sort_keys,
|
@@ -5,12 +5,11 @@ from typing import Iterator, List
|
|
5
5
|
from pyarrow.fs import FileSystem
|
6
6
|
|
7
7
|
from pyiceberg.io.pyarrow import (
|
8
|
-
|
8
|
+
data_file_statistics_from_parquet_metadata,
|
9
9
|
compute_statistics_plan,
|
10
10
|
parquet_path_to_id_mapping,
|
11
11
|
)
|
12
|
-
from pyiceberg.table import Table
|
13
|
-
from pyiceberg.table.snapshots import Operation
|
12
|
+
from pyiceberg.table import Table
|
14
13
|
from pyiceberg.manifest import DataFile, DataFileContent, FileFormat
|
15
14
|
from pyiceberg.types import StructType, NestedField, IntegerType
|
16
15
|
from pyiceberg.typedef import Record
|
@@ -24,11 +23,10 @@ def append(table: Table, paths: List[str]) -> None:
|
|
24
23
|
# raise ValueError("Cannot write to tables with a sort-order")
|
25
24
|
|
26
25
|
data_files = write_file(table, paths)
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
merge.commit()
|
26
|
+
with table.transaction() as txn:
|
27
|
+
with txn.update_snapshot().fast_append() as snapshot_update:
|
28
|
+
for data_file in data_files:
|
29
|
+
snapshot_update.append_data_file(data_file)
|
32
30
|
|
33
31
|
|
34
32
|
def write_file(table: Table, paths: Iterator[str]) -> Iterator[DataFile]:
|
@@ -41,6 +39,11 @@ def write_file(table: Table, paths: Iterator[str]) -> Iterator[DataFile]:
|
|
41
39
|
fs_path = fs_tuple[1]
|
42
40
|
with fs.open_input_file(fs_path) as native_file:
|
43
41
|
parquet_metadata = pq.read_metadata(native_file)
|
42
|
+
statistics = data_file_statistics_from_parquet_metadata(
|
43
|
+
parquet_metadata=parquet_metadata,
|
44
|
+
stats_columns=compute_statistics_plan(table.schema(), table.properties),
|
45
|
+
parquet_column_mapping=parquet_path_to_id_mapping(table.schema()),
|
46
|
+
)
|
44
47
|
data_file = DataFile(
|
45
48
|
content=DataFileContent.DATA,
|
46
49
|
file_path=file_path,
|
@@ -63,12 +66,7 @@ def write_file(table: Table, paths: Iterator[str]) -> Iterator[DataFile]:
|
|
63
66
|
spec_id=table.spec().spec_id,
|
64
67
|
equality_ids=None,
|
65
68
|
key_metadata=None,
|
66
|
-
|
67
|
-
fill_parquet_file_metadata(
|
68
|
-
data_file=data_file,
|
69
|
-
parquet_metadata=parquet_metadata,
|
70
|
-
stats_columns=compute_statistics_plan(table.schema(), table.properties),
|
71
|
-
parquet_column_mapping=parquet_path_to_id_mapping(table.schema()),
|
69
|
+
**statistics.to_serialized_dict(),
|
72
70
|
)
|
73
71
|
data_files.append(data_file)
|
74
72
|
return data_files
|