deltacat 1.1.36__py3-none-any.whl → 2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +42 -3
- deltacat/annotations.py +36 -0
- deltacat/api.py +168 -0
- deltacat/aws/s3u.py +4 -4
- deltacat/benchmarking/benchmark_engine.py +82 -0
- deltacat/benchmarking/benchmark_report.py +86 -0
- deltacat/benchmarking/benchmark_suite.py +11 -0
- deltacat/benchmarking/conftest.py +21 -0
- deltacat/benchmarking/data/random_row_generator.py +94 -0
- deltacat/benchmarking/data/row_generator.py +10 -0
- deltacat/benchmarking/test_benchmark_pipeline.py +106 -0
- deltacat/catalog/__init__.py +14 -0
- deltacat/catalog/delegate.py +199 -106
- deltacat/catalog/iceberg/__init__.py +4 -0
- deltacat/catalog/iceberg/iceberg_catalog_config.py +26 -0
- deltacat/catalog/iceberg/impl.py +368 -0
- deltacat/catalog/iceberg/overrides.py +74 -0
- deltacat/catalog/interface.py +273 -76
- deltacat/catalog/main/impl.py +720 -0
- deltacat/catalog/model/catalog.py +227 -20
- deltacat/catalog/model/properties.py +116 -0
- deltacat/catalog/model/table_definition.py +32 -1
- deltacat/compute/compactor/model/compaction_session_audit_info.py +7 -3
- deltacat/compute/compactor/model/delta_annotated.py +3 -3
- deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
- deltacat/compute/compactor/model/delta_file_locator.py +3 -1
- deltacat/compute/compactor/model/round_completion_info.py +5 -5
- deltacat/compute/compactor/model/table_object_store.py +3 -2
- deltacat/compute/compactor/repartition_session.py +1 -1
- deltacat/compute/compactor/steps/dedupe.py +11 -4
- deltacat/compute/compactor/steps/hash_bucket.py +1 -1
- deltacat/compute/compactor/steps/materialize.py +6 -2
- deltacat/compute/compactor/utils/io.py +1 -1
- deltacat/compute/compactor/utils/sort_key.py +9 -2
- deltacat/compute/compactor_v2/compaction_session.py +5 -9
- deltacat/compute/compactor_v2/constants.py +1 -30
- deltacat/compute/compactor_v2/deletes/utils.py +3 -3
- deltacat/compute/compactor_v2/model/merge_input.py +1 -7
- deltacat/compute/compactor_v2/private/compaction_utils.py +5 -6
- deltacat/compute/compactor_v2/steps/merge.py +17 -126
- deltacat/compute/compactor_v2/utils/content_type_params.py +0 -17
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/io.py +1 -1
- deltacat/compute/compactor_v2/utils/merge.py +0 -1
- deltacat/compute/compactor_v2/utils/primary_key_index.py +3 -15
- deltacat/compute/compactor_v2/utils/task_options.py +23 -43
- deltacat/compute/converter/constants.py +4 -0
- deltacat/compute/converter/converter_session.py +143 -0
- deltacat/compute/converter/model/convert_input.py +69 -0
- deltacat/compute/converter/model/convert_input_files.py +61 -0
- deltacat/compute/converter/model/converter_session_params.py +99 -0
- deltacat/compute/converter/pyiceberg/__init__.py +0 -0
- deltacat/compute/converter/pyiceberg/catalog.py +75 -0
- deltacat/compute/converter/pyiceberg/overrides.py +135 -0
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +251 -0
- deltacat/compute/converter/steps/__init__.py +0 -0
- deltacat/compute/converter/steps/convert.py +211 -0
- deltacat/compute/converter/steps/dedupe.py +60 -0
- deltacat/compute/converter/utils/__init__.py +0 -0
- deltacat/compute/converter/utils/convert_task_options.py +88 -0
- deltacat/compute/converter/utils/converter_session_utils.py +109 -0
- deltacat/compute/converter/utils/iceberg_columns.py +82 -0
- deltacat/compute/converter/utils/io.py +43 -0
- deltacat/compute/converter/utils/s3u.py +133 -0
- deltacat/compute/resource_estimation/delta.py +1 -19
- deltacat/constants.py +47 -1
- deltacat/env.py +51 -0
- deltacat/examples/__init__.py +0 -0
- deltacat/examples/basic_logging.py +101 -0
- deltacat/examples/common/__init__.py +0 -0
- deltacat/examples/common/fixtures.py +15 -0
- deltacat/examples/hello_world.py +27 -0
- deltacat/examples/iceberg/__init__.py +0 -0
- deltacat/examples/iceberg/iceberg_bucket_writer.py +139 -0
- deltacat/examples/iceberg/iceberg_reader.py +149 -0
- deltacat/exceptions.py +51 -9
- deltacat/logs.py +4 -1
- deltacat/storage/__init__.py +118 -28
- deltacat/storage/iceberg/__init__.py +0 -0
- deltacat/storage/iceberg/iceberg_scan_planner.py +28 -0
- deltacat/storage/iceberg/impl.py +737 -0
- deltacat/storage/iceberg/model.py +709 -0
- deltacat/storage/interface.py +217 -134
- deltacat/storage/main/__init__.py +0 -0
- deltacat/storage/main/impl.py +2077 -0
- deltacat/storage/model/delta.py +118 -71
- deltacat/storage/model/interop.py +24 -0
- deltacat/storage/model/list_result.py +8 -0
- deltacat/storage/model/locator.py +93 -3
- deltacat/{aws/redshift → storage}/model/manifest.py +122 -98
- deltacat/storage/model/metafile.py +1316 -0
- deltacat/storage/model/namespace.py +34 -18
- deltacat/storage/model/partition.py +362 -37
- deltacat/storage/model/scan/__init__.py +0 -0
- deltacat/storage/model/scan/push_down.py +19 -0
- deltacat/storage/model/scan/scan_plan.py +10 -0
- deltacat/storage/model/scan/scan_task.py +34 -0
- deltacat/storage/model/schema.py +892 -0
- deltacat/storage/model/shard.py +47 -0
- deltacat/storage/model/sort_key.py +170 -13
- deltacat/storage/model/stream.py +208 -80
- deltacat/storage/model/table.py +123 -29
- deltacat/storage/model/table_version.py +322 -46
- deltacat/storage/model/transaction.py +757 -0
- deltacat/storage/model/transform.py +198 -61
- deltacat/storage/model/types.py +111 -13
- deltacat/storage/rivulet/__init__.py +11 -0
- deltacat/storage/rivulet/arrow/__init__.py +0 -0
- deltacat/storage/rivulet/arrow/serializer.py +75 -0
- deltacat/storage/rivulet/dataset.py +744 -0
- deltacat/storage/rivulet/dataset_executor.py +87 -0
- deltacat/storage/rivulet/feather/__init__.py +5 -0
- deltacat/storage/rivulet/feather/file_reader.py +136 -0
- deltacat/storage/rivulet/feather/serializer.py +35 -0
- deltacat/storage/rivulet/fs/__init__.py +0 -0
- deltacat/storage/rivulet/fs/file_provider.py +105 -0
- deltacat/storage/rivulet/fs/file_store.py +130 -0
- deltacat/storage/rivulet/fs/input_file.py +76 -0
- deltacat/storage/rivulet/fs/output_file.py +86 -0
- deltacat/storage/rivulet/logical_plan.py +105 -0
- deltacat/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/storage/rivulet/metastore/delta.py +190 -0
- deltacat/storage/rivulet/metastore/json_sst.py +105 -0
- deltacat/storage/rivulet/metastore/sst.py +82 -0
- deltacat/storage/rivulet/metastore/sst_interval_tree.py +260 -0
- deltacat/storage/rivulet/mvp/Table.py +101 -0
- deltacat/storage/rivulet/mvp/__init__.py +5 -0
- deltacat/storage/rivulet/parquet/__init__.py +5 -0
- deltacat/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/storage/rivulet/parquet/file_reader.py +127 -0
- deltacat/storage/rivulet/parquet/serializer.py +37 -0
- deltacat/storage/rivulet/reader/__init__.py +0 -0
- deltacat/storage/rivulet/reader/block_scanner.py +378 -0
- deltacat/storage/rivulet/reader/data_reader.py +136 -0
- deltacat/storage/rivulet/reader/data_scan.py +63 -0
- deltacat/storage/rivulet/reader/dataset_metastore.py +178 -0
- deltacat/storage/rivulet/reader/dataset_reader.py +156 -0
- deltacat/storage/rivulet/reader/pyarrow_data_reader.py +121 -0
- deltacat/storage/rivulet/reader/query_expression.py +99 -0
- deltacat/storage/rivulet/reader/reader_type_registrar.py +84 -0
- deltacat/storage/rivulet/schema/__init__.py +0 -0
- deltacat/storage/rivulet/schema/datatype.py +128 -0
- deltacat/storage/rivulet/schema/schema.py +251 -0
- deltacat/storage/rivulet/serializer.py +40 -0
- deltacat/storage/rivulet/serializer_factory.py +42 -0
- deltacat/storage/rivulet/writer/__init__.py +0 -0
- deltacat/storage/rivulet/writer/dataset_writer.py +29 -0
- deltacat/storage/rivulet/writer/memtable_dataset_writer.py +294 -0
- deltacat/tests/_io/__init__.py +1 -0
- deltacat/tests/catalog/test_catalogs.py +324 -0
- deltacat/tests/catalog/test_default_catalog_impl.py +16 -8
- deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
- deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
- deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
- deltacat/tests/compute/compact_partition_test_cases.py +19 -53
- deltacat/tests/compute/compactor/steps/test_repartition.py +2 -2
- deltacat/tests/compute/compactor/utils/test_io.py +6 -8
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +0 -466
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +1 -273
- deltacat/tests/compute/conftest.py +75 -0
- deltacat/tests/compute/converter/__init__.py +0 -0
- deltacat/tests/compute/converter/conftest.py +80 -0
- deltacat/tests/compute/converter/test_convert_session.py +478 -0
- deltacat/tests/compute/converter/utils.py +123 -0
- deltacat/tests/compute/resource_estimation/test_delta.py +0 -16
- deltacat/tests/compute/test_compact_partition_incremental.py +2 -42
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +5 -46
- deltacat/tests/compute/test_compact_partition_params.py +3 -3
- deltacat/tests/compute/test_compact_partition_rebase.py +1 -46
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +5 -46
- deltacat/tests/compute/test_util_common.py +19 -12
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +13 -22
- deltacat/tests/local_deltacat_storage/__init__.py +76 -103
- deltacat/tests/storage/__init__.py +0 -0
- deltacat/tests/storage/conftest.py +25 -0
- deltacat/tests/storage/main/__init__.py +0 -0
- deltacat/tests/storage/main/test_main_storage.py +1399 -0
- deltacat/tests/storage/model/__init__.py +0 -0
- deltacat/tests/storage/model/test_delete_parameters.py +21 -0
- deltacat/tests/storage/model/test_metafile_io.py +2535 -0
- deltacat/tests/storage/model/test_schema.py +308 -0
- deltacat/tests/storage/model/test_shard.py +22 -0
- deltacat/tests/storage/model/test_table_version.py +110 -0
- deltacat/tests/storage/model/test_transaction.py +308 -0
- deltacat/tests/storage/rivulet/__init__.py +0 -0
- deltacat/tests/storage/rivulet/conftest.py +149 -0
- deltacat/tests/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +93 -0
- deltacat/tests/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/storage/rivulet/schema/test_schema.py +241 -0
- deltacat/tests/storage/rivulet/test_dataset.py +406 -0
- deltacat/tests/storage/rivulet/test_manifest.py +67 -0
- deltacat/tests/storage/rivulet/test_sst_interval_tree.py +232 -0
- deltacat/tests/storage/rivulet/test_utils.py +122 -0
- deltacat/tests/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/storage/rivulet/writer/test_dataset_write_then_read.py +341 -0
- deltacat/tests/storage/rivulet/writer/test_dataset_writer.py +79 -0
- deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
- deltacat/tests/test_deltacat_api.py +39 -0
- deltacat/tests/test_utils/filesystem.py +14 -0
- deltacat/tests/test_utils/message_pack_utils.py +54 -0
- deltacat/tests/test_utils/pyarrow.py +8 -15
- deltacat/tests/test_utils/storage.py +266 -3
- deltacat/tests/utils/test_daft.py +3 -3
- deltacat/tests/utils/test_pyarrow.py +0 -432
- deltacat/types/partial_download.py +1 -1
- deltacat/types/tables.py +1 -1
- deltacat/utils/export.py +59 -0
- deltacat/utils/filesystem.py +320 -0
- deltacat/utils/metafile_locator.py +73 -0
- deltacat/utils/pyarrow.py +36 -183
- deltacat-2.0.dist-info/METADATA +65 -0
- deltacat-2.0.dist-info/RECORD +347 -0
- deltacat/aws/redshift/__init__.py +0 -19
- deltacat/catalog/default_catalog_impl/__init__.py +0 -369
- deltacat/io/dataset.py +0 -73
- deltacat/io/read_api.py +0 -143
- deltacat/storage/model/delete_parameters.py +0 -40
- deltacat/storage/model/partition_spec.py +0 -71
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +0 -253
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +0 -45
- deltacat-1.1.36.dist-info/METADATA +0 -64
- deltacat-1.1.36.dist-info/RECORD +0 -219
- /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
- /deltacat/{io/aws → catalog/main}/__init__.py +0 -0
- /deltacat/{io/aws/redshift → compute/converter}/__init__.py +0 -0
- /deltacat/{tests/io → compute/converter/model}/__init__.py +0 -0
- /deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +0 -0
- /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
- {deltacat-1.1.36.dist-info → deltacat-2.0.dist-info}/LICENSE +0 -0
- {deltacat-1.1.36.dist-info → deltacat-2.0.dist-info}/WHEEL +0 -0
- {deltacat-1.1.36.dist-info → deltacat-2.0.dist-info}/top_level.txt +0 -0
deltacat/benchmarking/test_benchmark_pipeline.py
ADDED
@@ -0,0 +1,106 @@
+import math
+from random import shuffle
+import pytest
+from deltacat.storage.rivulet.dataset import Dataset
+from deltacat.storage.rivulet.schema.datatype import Datatype
+from deltacat.storage.rivulet.reader.query_expression import QueryExpression
+from deltacat.storage.rivulet.schema.schema import Schema
+from deltacat.benchmarking.benchmark_engine import BenchmarkEngine
+from deltacat.benchmarking.benchmark_report import BenchmarkRun, BenchmarkReport
+from deltacat.benchmarking.benchmark_suite import BenchmarkSuite
+from deltacat.benchmarking.data.random_row_generator import RandomRowGenerator
+from deltacat.benchmarking.data.row_generator import RowGenerator
+from deltacat.tests.test_utils.filesystem import temp_dir_autocleanup
+
+pytestmark = pytest.mark.benchmark
+
+
+@pytest.fixture
+def schema():
+    return Schema(
+        [
+            ("id", Datatype.int32()),
+            ("source", Datatype.string()),
+            ("media", Datatype.image("png")),
+        ],
+        "id",
+    )
+
+
+class LoadAndScanSuite(BenchmarkSuite):
+    """Load some number of rows and scan"""
+
+    schema_name = "LoadAndScanSuite"
+
+    def __init__(self, dataset: Dataset, schema: Schema, generator, description=None):
+        self.suite = "ReadSuite"
+        self.dataset: Dataset = dataset
+        self.schema = schema
+        self.dataset.add_schema(schema, LoadAndScanSuite.schema_name)
+        self.generator: RowGenerator = generator
+        self.description: str = description or f"{self.dataset} x {self.generator}"
+
+    def run(self) -> BenchmarkRun:
+        container = BenchmarkEngine(self.dataset)
+        run = BenchmarkRun(self.suite, self.description)
+        # load a large number of rows
+        manifest_uri, step = container.load_and_commit(
+            LoadAndScanSuite.schema_name, self.generator, 1000
+        )
+        run.add(step)
+        # do a full scan of all rows (and eagerly load them)
+        keys, step = container.scan()
+        run.add(step)
+        # randomly retrieve all keys one-by-one from the dataset
+        random_keys = list(keys)
+        shuffle(random_keys)
+        step = container.run_queries(
+            "load all keys individually",
+            manifest_uri,
+            [QueryExpression().with_key(k) for k in random_keys],
+        )
+        run.add(step)
+        # split into 4 key ranges and get them individually
+        quartiles = self._generate_quartiles(keys)
+        expressions = [
+            QueryExpression().with_range(start, end) for (start, end) in quartiles
+        ]
+        step = container.run_queries(
+            "load key ranges by quartile", manifest_uri, expressions
+        )
+        run.add(step)
+        return run
+
+    @staticmethod
+    def _generate_quartiles(keys):
+        sorted_keys = sorted(keys)
+        size = len(keys)
+        starts = list(range(0, size, math.ceil(size / 4)))
+        ends = list([x - 1 for x in starts[1:]])
+        ends.append(size - 1)
+        quartiles = list(zip(starts, ends))
+        return [(sorted_keys[start], sorted_keys[end]) for (start, end) in quartiles]
+
+
+def test_suite1(schema: Schema, report: BenchmarkReport):
+    with temp_dir_autocleanup() as temp_dir:
+        generator = RandomRowGenerator(123, temp_dir)
+        report.add(
+            LoadAndScanSuite(
+                Dataset(dataset_name="test_suite1_ds1", metadata_uri=temp_dir),
+                schema,
+                generator,
+                "SST (rand)",
+            ).run()
+        )
+
+    with temp_dir_autocleanup() as temp_dir:
+        generator = RandomRowGenerator(123, temp_dir)
+        report.add(
+            LoadAndScanSuite(
+                Dataset(dataset_name="test_suite1_ds2", metadata_uri=temp_dir),
+                schema,
+                generator,
+                "dupe",
+            ).run()
+        )
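
Because the module above sets `pytestmark = pytest.mark.benchmark`, every test in it carries the benchmark marker and can be selected or deselected with pytest's `-m` filter. A minimal sketch of invoking it programmatically (the module path comes from the file listing above; whether the marker is pre-registered depends on the package's benchmarking conftest):

    import pytest

    # Select only tests marked "benchmark" from the new benchmark pipeline module.
    pytest.main(["-m", "benchmark", "deltacat/benchmarking/test_benchmark_pipeline.py"])
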
deltacat/catalog/__init__.py
CHANGED
@@ -0,0 +1,14 @@
+from deltacat.catalog.model.properties import (  # noqa: F401
+    CatalogProperties,
+    get_catalog_properties,
+)
+from deltacat.catalog.model.catalog import Catalog, Catalogs  # noqa: F401
+from deltacat.catalog.main import impl as DeltacatCatalog
+
+__all__ = [
+    "CatalogProperties",
+    "get_catalog_properties",
+    "Catalog",
+    "Catalogs",
+    "DeltacatCatalog",
+]
deltacat/catalog/delegate.py
CHANGED
@@ -1,53 +1,33 @@
-from typing import Any, Dict, List, Optional,
+from typing import Any, Dict, List, Optional, Union

-
-import ray
-
-from deltacat.catalog.model.catalog import Catalog, all_catalogs
+from deltacat.catalog.model.catalog import get_catalog
 from deltacat.catalog.model.table_definition import TableDefinition
-from deltacat.storage.model.
+from deltacat.storage.model.partition import PartitionScheme
+from deltacat.storage.model.sort_key import SortScheme
 from deltacat.storage.model.list_result import ListResult
-from deltacat.storage.model.namespace import Namespace
+from deltacat.storage.model.namespace import Namespace, NamespaceProperties
+from deltacat.storage.model.schema import Schema
+from deltacat.storage.model.table import TableProperties
 from deltacat.storage.model.types import (
     DistributedDataset,
     LifecycleState,
     LocalDataset,
     LocalTable,
-
+    StreamFormat,
 )
 from deltacat.types.media import ContentType
 from deltacat.types.tables import TableWriteMode


-def _get_catalog(name: Optional[str] = None) -> Catalog:
-    if not all_catalogs:
-        raise ValueError(
-            "No catalogs available! Call "
-            "`deltacat.init(catalogs={...})` to register one or more "
-            "catalogs then retry."
-        )
-    catalog = (
-        ray.get(all_catalogs.get.remote(name))
-        if name
-        else ray.get(all_catalogs.default.remote())
-    )
-    if not catalog:
-        available_catalogs = ray.get(all_catalogs.all.remote()).values()
-        raise ValueError(
-            f"Catalog '{name}' not found. Available catalogs: " f"{available_catalogs}."
-        )
-    return catalog
-
-
 # table functions
 def write_to_table(
     data: Union[LocalTable, LocalDataset, DistributedDataset],
     table: str,
+    *args,
     namespace: Optional[str] = None,
     catalog: Optional[str] = None,
     mode: TableWriteMode = TableWriteMode.AUTO,
     content_type: ContentType = ContentType.PARQUET,
-    *args,
     **kwargs,
 ) -> None:
     """Write local or distributed data to a table. Raises an error if the
@@ -57,228 +37,341 @@ def write_to_table(
     specified as additional keyword arguments. When appending to, or replacing,
     an existing table, all `alter_table` parameters may be optionally specified
     as additional keyword arguments."""
-
-
+    catalog_obj = get_catalog(catalog)
+    catalog_obj.impl.write_to_table(
+        data,
+        table,
+        *args,
+        namespace=namespace,
+        mode=mode,
+        content_type=content_type,
+        inner=catalog_obj.inner,
+        **kwargs,
     )


 def read_table(
     table: str,
+    *args,
     namespace: Optional[str] = None,
     catalog: Optional[str] = None,
-    *args,
     **kwargs,
 ) -> DistributedDataset:
     """Read a table into a distributed dataset."""
-
+    catalog_obj = get_catalog(catalog)
+    return catalog_obj.impl.read_table(
+        table,
+        *args,
+        namespace=namespace,
+        inner=catalog_obj.inner,
+        **kwargs,
+    )


 def alter_table(
     table: str,
+    *args,
     namespace: Optional[str] = None,
     catalog: Optional[str] = None,
     lifecycle_state: Optional[LifecycleState] = None,
     schema_updates: Optional[Dict[str, Any]] = None,
     partition_updates: Optional[Dict[str, Any]] = None,
-
-    sort_keys: Optional[List[SortKey]] = None,
+    sort_keys: Optional[SortScheme] = None,
     description: Optional[str] = None,
-    properties: Optional[
-    *args,
+    properties: Optional[TableProperties] = None,
     **kwargs,
 ) -> None:
     """Alter table definition."""
-
+    catalog_obj = get_catalog(catalog)
+    catalog_obj.impl.alter_table(
         table,
-        namespace,
-        lifecycle_state,
-        schema_updates,
-        partition_updates,
-        primary_keys,
-        sort_keys,
-        description,
-        properties,
         *args,
+        namespace=namespace,
+        lifecycle_state=lifecycle_state,
+        schema_updates=schema_updates,
+        partition_updates=partition_updates,
+        sort_keys=sort_keys,
+        description=description,
+        properties=properties,
+        inner=catalog_obj.inner,
         **kwargs,
     )


 def create_table(
-
+    name: str,
+    *args,
     namespace: Optional[str] = None,
     catalog: Optional[str] = None,
-
-
-
-
-
-    sort_keys: Optional[List[SortKey]] = None,
+    version: Optional[str] = None,
+    lifecycle_state: Optional[LifecycleState] = LifecycleState.ACTIVE,
+    schema: Optional[Schema] = None,
+    partition_scheme: Optional[PartitionScheme] = None,
+    sort_keys: Optional[SortScheme] = None,
     description: Optional[str] = None,
-
-
+    table_properties: Optional[TableProperties] = None,
+    namespace_properties: Optional[NamespaceProperties] = None,
     content_types: Optional[List[ContentType]] = None,
-
-    *args,
+    fail_if_exists: bool = True,
     **kwargs,
 ) -> TableDefinition:
     """Create an empty table. Raises an error if the table already exists and
-    `
-
-
-
-        lifecycle_state,
-        schema,
-        schema_consistency,
-        partition_keys,
-        primary_keys,
-        sort_keys,
-        description,
-        properties,
-        permissions,
-        content_types,
-        replace_existing_table,
+    `fail_if_exists` is True (default behavior)."""
+    catalog_obj = get_catalog(catalog)
+    return catalog_obj.impl.create_table(
+        name,
         *args,
+        namespace=namespace,
+        version=version,
+        lifecycle_state=lifecycle_state,
+        schema=schema,
+        partition_scheme=partition_scheme,
+        sort_keys=sort_keys,
+        description=description,
+        table_properties=table_properties,
+        namespace_properties=namespace_properties,
+        content_types=content_types,
+        fail_if_exists=fail_if_exists,
+        inner=catalog_obj.inner,
         **kwargs,
     )


 def drop_table(
-
+    name: str,
+    *args,
     namespace: Optional[str] = None,
     catalog: Optional[str] = None,
+    table_version: Optional[str] = None,
     purge: bool = False,
-    *args,
     **kwargs,
 ) -> None:
     """Drop a table from the catalog and optionally purge it. Raises an error
     if the table does not exist."""
-
+    catalog_obj = get_catalog(catalog)
+    catalog_obj.impl.drop_table(
+        name,
+        *args,
+        namespace=namespace,
+        table_version=table_version,
+        purge=purge,
+        inner=catalog_obj.inner,
+        **kwargs,
+    )


 def refresh_table(
     table: str,
+    *args,
     namespace: Optional[str] = None,
     catalog: Optional[str] = None,
-    *args,
     **kwargs,
 ) -> None:
     """Refresh metadata cached on the Ray cluster for the given table."""
-
+    catalog_obj = get_catalog(catalog)
+    catalog_obj.impl.refresh_table(
+        table,
+        *args,
+        namespace=namespace,
+        inner=catalog_obj.inner,
+        **kwargs,
+    )


 def list_tables(
-    namespace: Optional[str] = None, catalog: Optional[str] = None,
+    *args, namespace: Optional[str] = None, catalog: Optional[str] = None, **kwargs
 ) -> ListResult[TableDefinition]:
     """List a page of table definitions. Raises an error if the given namespace
     does not exist."""
-
+    catalog_obj = get_catalog(catalog)
+    return catalog_obj.impl.list_tables(
+        *args,
+        namespace=namespace,
+        inner=catalog_obj.inner,
+        **kwargs,
+    )


 def get_table(
-
+    name: str,
+    *args,
     namespace: Optional[str] = None,
     catalog: Optional[str] = None,
-
+    table_version: Optional[str] = None,
+    stream_format: StreamFormat = StreamFormat.DELTACAT,
     **kwargs,
 ) -> Optional[TableDefinition]:
     """Get table definition metadata. Returns None if the given table does not
     exist."""
-
+    catalog_obj = get_catalog(catalog)
+    return catalog_obj.impl.get_table(
+        name,
+        *args,
+        namespace=namespace,
+        table_version=table_version,
+        stream_format=stream_format,
+        inner=catalog_obj.inner,
+        **kwargs,
+    )


 def truncate_table(
     table: str,
+    *args,
     namespace: Optional[str] = None,
     catalog: Optional[str] = None,
-    *args,
     **kwargs,
 ) -> None:
     """Truncate table data. Raises an error if the table does not exist."""
-
+    catalog_obj = get_catalog(catalog)
+    catalog_obj.impl.truncate_table(
+        table,
+        *args,
+        namespace=namespace,
+        inner=catalog_obj.inner,
+        **kwargs,
+    )


 def rename_table(
     table: str,
     new_name: str,
+    *args,
     namespace: Optional[str] = None,
     catalog: Optional[str] = None,
-    *args,
     **kwargs,
 ) -> None:
     """Rename a table."""
-
+    catalog_obj = get_catalog(catalog)
+    catalog_obj.impl.rename_table(
+        table,
+        new_name,
+        *args,
+        namespace=namespace,
+        inner=catalog_obj.inner,
+        **kwargs,
+    )


 def table_exists(
     table: str,
+    *args,
     namespace: Optional[str] = None,
     catalog: Optional[str] = None,
-    *args,
     **kwargs,
 ) -> bool:
     """Returns True if the given table exists, False if not."""
-
+    catalog_obj = get_catalog(catalog)
+    return catalog_obj.impl.table_exists(
+        table,
+        *args,
+        namespace=namespace,
+        inner=catalog_obj.inner,
+        **kwargs,
+    )


 # namespace functions
 def list_namespaces(
-    catalog: Optional[str] = None,
+    *args, catalog: Optional[str] = None, **kwargs
 ) -> ListResult[Namespace]:
     """List a page of table namespaces."""
-
+    catalog_obj = get_catalog(catalog)
+    return catalog_obj.impl.list_namespaces(
+        *args,
+        inner=catalog_obj.inner,
+        **kwargs,
+    )


 def get_namespace(
-    namespace: str,
+    namespace: str,
+    catalog: Optional[str] = None,
+    *args,
+    **kwargs,
 ) -> Optional[Namespace]:
     """Get table namespace metadata for the specified table namespace. Returns
     None if the given namespace does not exist."""
-
+    catalog_obj = get_catalog(catalog)
+    return catalog_obj.impl.get_namespace(
+        namespace,
+        *args,
+        inner=catalog_obj.inner,
+        **kwargs,
+    )


 def namespace_exists(
-    namespace: str,
+    namespace: str,
+    catalog: Optional[str] = None,
+    *args,
+    **kwargs,
 ) -> bool:
     """Returns True if the given table namespace exists, False if not."""
-
+    catalog_obj = get_catalog(catalog)
+    return catalog_obj.impl.namespace_exists(
+        namespace,
+        *args,
+        inner=catalog_obj.inner,
+        **kwargs,
+    )


 def create_namespace(
     namespace: str,
-
+    properties: Optional[NamespaceProperties] = None,
     catalog: Optional[str] = None,
     *args,
     **kwargs,
 ) -> Namespace:
-    """Creates a table namespace with the given name and
+    """Creates a table namespace with the given name and properties. Returns
     the created namespace. Raises an error if the namespace already exists."""
-
-
+    catalog_obj = get_catalog(catalog)
+    return catalog_obj.impl.create_namespace(
+        namespace,
+        *args,
+        properties=properties,
+        inner=catalog_obj.inner,
+        **kwargs,
     )


 def alter_namespace(
     namespace: str,
+    *args,
     catalog: Optional[str] = None,
-
+    properties: Optional[NamespaceProperties] = None,
     new_namespace: Optional[str] = None,
-    *args,
     **kwargs,
 ) -> None:
     """Alter table namespace definition."""
-
-
+    catalog_obj = get_catalog(catalog)
+    catalog_obj.impl.alter_namespace(
+        namespace,
+        *args,
+        properties=properties,
+        new_namespace=new_namespace,
+        inner=catalog_obj.inner,
+        **kwargs,
     )


 def drop_namespace(
-    namespace: str, catalog: Optional[str] = None, purge: bool = False,
+    namespace: str, *args, catalog: Optional[str] = None, purge: bool = False, **kwargs
 ) -> None:
     """Drop the given namespace and all of its tables from the catalog,
     optionally purging them."""
-
+    catalog_obj = get_catalog(catalog)
+    catalog_obj.impl.drop_namespace(
+        namespace,
+        *args,
+        purge=purge,
+        inner=catalog_obj.inner,
+        **kwargs,
+    )


-def default_namespace(catalog: Optional[str] = None) -> str:
+def default_namespace(*args, catalog: Optional[str] = None, **kwargs) -> str:
     """Returns the default namespace for the catalog."""
-
+    catalog_obj = get_catalog(catalog)
+    return catalog_obj.impl.default_namespace(*args, inner=catalog_obj.inner, **kwargs)
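
The rewritten delegate above no longer resolves catalogs through a Ray actor; each function looks up a registered Catalog by name with `get_catalog(catalog)` and forwards its arguments to that catalog's `impl` module, adding the catalog's own state via the `inner=` keyword. A minimal sketch of the module-level contract a pluggable catalog implementation would therefore need to satisfy (the module name, the `initialize` return value, and the stubbed bodies are illustrative assumptions, not code from this package):

    # my_catalog_impl.py -- hypothetical catalog implementation module (sketch only)
    from typing import Any, Optional

    from deltacat.catalog.model.table_definition import TableDefinition


    def initialize(*args, **kwargs) -> Any:
        # Return whatever state object the delegate should later pass back as inner=.
        return {"root": kwargs.get("root", "/tmp/deltacat")}


    def namespace_exists(namespace: str, *args, inner: Any = None, **kwargs) -> bool:
        # inner= is injected by deltacat.catalog.delegate on every call.
        return False


    def get_table(
        name: str,
        *args,
        namespace: Optional[str] = None,
        table_version: Optional[str] = None,
        inner: Any = None,
        **kwargs,
    ) -> Optional[TableDefinition]:
        # Return None when the table does not exist, mirroring the delegate docstring.
        return None

The Iceberg and main catalog implementations added in this release (deltacat/catalog/iceberg/impl.py and deltacat/catalog/main/impl.py) fill this same role.
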
deltacat/catalog/iceberg/iceberg_catalog_config.py
ADDED
@@ -0,0 +1,26 @@
+from __future__ import annotations
+from typing import Any, Dict
+
+from attr import dataclass
+from pyiceberg.catalog import CatalogType
+
+
+@dataclass
+class IcebergCatalogConfig:
+    """
+    Configuration properties for Iceberg catalog implementation.
+
+    This class holds the PyIceberg Catalog instance needed for interaction with
+    Iceberg tables and metadata.
+
+    This configuration is passed through to PyIceberg by invoking load_catalog.
+    The Properties provided must match properties accepted by PyIceberg for each catalog type
+    See: :func:`deltacat.catalog.iceberg.initialize`
+
+    Attributes:
+        type: The PyIceberg Catalog instance
+        properties: Dict of properties passed to pyiceberg load_catalog
+    """
+
+    type: CatalogType
+    properties: Dict[str, Any]
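
Per its docstring, IcebergCatalogConfig simply pairs a PyIceberg catalog type with the properties handed to PyIceberg's load_catalog. A small illustrative sketch (the SQL catalog type and the `uri`/`warehouse` property names are standard PyIceberg options, assumed here rather than taken from this diff):

    from pyiceberg.catalog import CatalogType

    from deltacat.catalog.iceberg.iceberg_catalog_config import IcebergCatalogConfig

    # Hypothetical local, SQLite-backed PyIceberg catalog configuration.
    config = IcebergCatalogConfig(
        type=CatalogType.SQL,
        properties={
            "uri": "sqlite:////tmp/iceberg/catalog.db",
            "warehouse": "file:///tmp/iceberg/warehouse",
        },
    )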