deltacat 1.1.35__py3-none-any.whl → 2.0__py3-none-any.whl
This diff compares the contents of two publicly released package versions as they appear in their public registry. It is provided for informational purposes only.
- deltacat/__init__.py +42 -3
- deltacat/annotations.py +36 -0
- deltacat/api.py +168 -0
- deltacat/aws/s3u.py +4 -4
- deltacat/benchmarking/benchmark_engine.py +82 -0
- deltacat/benchmarking/benchmark_report.py +86 -0
- deltacat/benchmarking/benchmark_suite.py +11 -0
- deltacat/benchmarking/conftest.py +21 -0
- deltacat/benchmarking/data/random_row_generator.py +94 -0
- deltacat/benchmarking/data/row_generator.py +10 -0
- deltacat/benchmarking/test_benchmark_pipeline.py +106 -0
- deltacat/catalog/__init__.py +14 -0
- deltacat/catalog/delegate.py +199 -106
- deltacat/catalog/iceberg/__init__.py +4 -0
- deltacat/catalog/iceberg/iceberg_catalog_config.py +26 -0
- deltacat/catalog/iceberg/impl.py +368 -0
- deltacat/catalog/iceberg/overrides.py +74 -0
- deltacat/catalog/interface.py +273 -76
- deltacat/catalog/main/impl.py +720 -0
- deltacat/catalog/model/catalog.py +227 -20
- deltacat/catalog/model/properties.py +116 -0
- deltacat/catalog/model/table_definition.py +32 -1
- deltacat/compute/compactor/model/compaction_session_audit_info.py +7 -3
- deltacat/compute/compactor/model/delta_annotated.py +3 -3
- deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
- deltacat/compute/compactor/model/delta_file_locator.py +3 -1
- deltacat/compute/compactor/model/round_completion_info.py +5 -5
- deltacat/compute/compactor/model/table_object_store.py +3 -2
- deltacat/compute/compactor/repartition_session.py +1 -1
- deltacat/compute/compactor/steps/dedupe.py +11 -4
- deltacat/compute/compactor/steps/hash_bucket.py +1 -1
- deltacat/compute/compactor/steps/materialize.py +6 -2
- deltacat/compute/compactor/utils/io.py +1 -1
- deltacat/compute/compactor/utils/sort_key.py +9 -2
- deltacat/compute/compactor_v2/compaction_session.py +2 -3
- deltacat/compute/compactor_v2/constants.py +1 -30
- deltacat/compute/compactor_v2/deletes/utils.py +3 -3
- deltacat/compute/compactor_v2/model/merge_input.py +1 -1
- deltacat/compute/compactor_v2/private/compaction_utils.py +5 -5
- deltacat/compute/compactor_v2/steps/merge.py +11 -80
- deltacat/compute/compactor_v2/utils/content_type_params.py +0 -17
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/io.py +1 -1
- deltacat/compute/compactor_v2/utils/primary_key_index.py +3 -15
- deltacat/compute/compactor_v2/utils/task_options.py +23 -43
- deltacat/compute/converter/constants.py +4 -0
- deltacat/compute/converter/converter_session.py +143 -0
- deltacat/compute/converter/model/convert_input.py +69 -0
- deltacat/compute/converter/model/convert_input_files.py +61 -0
- deltacat/compute/converter/model/converter_session_params.py +99 -0
- deltacat/compute/converter/pyiceberg/__init__.py +0 -0
- deltacat/compute/converter/pyiceberg/catalog.py +75 -0
- deltacat/compute/converter/pyiceberg/overrides.py +135 -0
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +251 -0
- deltacat/compute/converter/steps/__init__.py +0 -0
- deltacat/compute/converter/steps/convert.py +211 -0
- deltacat/compute/converter/steps/dedupe.py +60 -0
- deltacat/compute/converter/utils/__init__.py +0 -0
- deltacat/compute/converter/utils/convert_task_options.py +88 -0
- deltacat/compute/converter/utils/converter_session_utils.py +109 -0
- deltacat/compute/converter/utils/iceberg_columns.py +82 -0
- deltacat/compute/converter/utils/io.py +43 -0
- deltacat/compute/converter/utils/s3u.py +133 -0
- deltacat/compute/resource_estimation/delta.py +1 -19
- deltacat/constants.py +47 -1
- deltacat/env.py +51 -0
- deltacat/examples/__init__.py +0 -0
- deltacat/examples/basic_logging.py +101 -0
- deltacat/examples/common/__init__.py +0 -0
- deltacat/examples/common/fixtures.py +15 -0
- deltacat/examples/hello_world.py +27 -0
- deltacat/examples/iceberg/__init__.py +0 -0
- deltacat/examples/iceberg/iceberg_bucket_writer.py +139 -0
- deltacat/examples/iceberg/iceberg_reader.py +149 -0
- deltacat/exceptions.py +51 -9
- deltacat/logs.py +4 -1
- deltacat/storage/__init__.py +118 -28
- deltacat/storage/iceberg/__init__.py +0 -0
- deltacat/storage/iceberg/iceberg_scan_planner.py +28 -0
- deltacat/storage/iceberg/impl.py +737 -0
- deltacat/storage/iceberg/model.py +709 -0
- deltacat/storage/interface.py +217 -134
- deltacat/storage/main/__init__.py +0 -0
- deltacat/storage/main/impl.py +2077 -0
- deltacat/storage/model/delta.py +118 -71
- deltacat/storage/model/interop.py +24 -0
- deltacat/storage/model/list_result.py +8 -0
- deltacat/storage/model/locator.py +93 -3
- deltacat/{aws/redshift → storage}/model/manifest.py +122 -98
- deltacat/storage/model/metafile.py +1316 -0
- deltacat/storage/model/namespace.py +34 -18
- deltacat/storage/model/partition.py +362 -37
- deltacat/storage/model/scan/__init__.py +0 -0
- deltacat/storage/model/scan/push_down.py +19 -0
- deltacat/storage/model/scan/scan_plan.py +10 -0
- deltacat/storage/model/scan/scan_task.py +34 -0
- deltacat/storage/model/schema.py +892 -0
- deltacat/storage/model/shard.py +47 -0
- deltacat/storage/model/sort_key.py +170 -13
- deltacat/storage/model/stream.py +208 -80
- deltacat/storage/model/table.py +123 -29
- deltacat/storage/model/table_version.py +322 -46
- deltacat/storage/model/transaction.py +757 -0
- deltacat/storage/model/transform.py +198 -61
- deltacat/storage/model/types.py +111 -13
- deltacat/storage/rivulet/__init__.py +11 -0
- deltacat/storage/rivulet/arrow/__init__.py +0 -0
- deltacat/storage/rivulet/arrow/serializer.py +75 -0
- deltacat/storage/rivulet/dataset.py +744 -0
- deltacat/storage/rivulet/dataset_executor.py +87 -0
- deltacat/storage/rivulet/feather/__init__.py +5 -0
- deltacat/storage/rivulet/feather/file_reader.py +136 -0
- deltacat/storage/rivulet/feather/serializer.py +35 -0
- deltacat/storage/rivulet/fs/__init__.py +0 -0
- deltacat/storage/rivulet/fs/file_provider.py +105 -0
- deltacat/storage/rivulet/fs/file_store.py +130 -0
- deltacat/storage/rivulet/fs/input_file.py +76 -0
- deltacat/storage/rivulet/fs/output_file.py +86 -0
- deltacat/storage/rivulet/logical_plan.py +105 -0
- deltacat/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/storage/rivulet/metastore/delta.py +190 -0
- deltacat/storage/rivulet/metastore/json_sst.py +105 -0
- deltacat/storage/rivulet/metastore/sst.py +82 -0
- deltacat/storage/rivulet/metastore/sst_interval_tree.py +260 -0
- deltacat/storage/rivulet/mvp/Table.py +101 -0
- deltacat/storage/rivulet/mvp/__init__.py +5 -0
- deltacat/storage/rivulet/parquet/__init__.py +5 -0
- deltacat/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/storage/rivulet/parquet/file_reader.py +127 -0
- deltacat/storage/rivulet/parquet/serializer.py +37 -0
- deltacat/storage/rivulet/reader/__init__.py +0 -0
- deltacat/storage/rivulet/reader/block_scanner.py +378 -0
- deltacat/storage/rivulet/reader/data_reader.py +136 -0
- deltacat/storage/rivulet/reader/data_scan.py +63 -0
- deltacat/storage/rivulet/reader/dataset_metastore.py +178 -0
- deltacat/storage/rivulet/reader/dataset_reader.py +156 -0
- deltacat/storage/rivulet/reader/pyarrow_data_reader.py +121 -0
- deltacat/storage/rivulet/reader/query_expression.py +99 -0
- deltacat/storage/rivulet/reader/reader_type_registrar.py +84 -0
- deltacat/storage/rivulet/schema/__init__.py +0 -0
- deltacat/storage/rivulet/schema/datatype.py +128 -0
- deltacat/storage/rivulet/schema/schema.py +251 -0
- deltacat/storage/rivulet/serializer.py +40 -0
- deltacat/storage/rivulet/serializer_factory.py +42 -0
- deltacat/storage/rivulet/writer/__init__.py +0 -0
- deltacat/storage/rivulet/writer/dataset_writer.py +29 -0
- deltacat/storage/rivulet/writer/memtable_dataset_writer.py +294 -0
- deltacat/tests/_io/__init__.py +1 -0
- deltacat/tests/catalog/test_catalogs.py +324 -0
- deltacat/tests/catalog/test_default_catalog_impl.py +16 -8
- deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
- deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
- deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
- deltacat/tests/compute/compact_partition_test_cases.py +19 -53
- deltacat/tests/compute/compactor/steps/test_repartition.py +2 -2
- deltacat/tests/compute/compactor/utils/test_io.py +6 -8
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +0 -466
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +1 -273
- deltacat/tests/compute/conftest.py +75 -0
- deltacat/tests/compute/converter/__init__.py +0 -0
- deltacat/tests/compute/converter/conftest.py +80 -0
- deltacat/tests/compute/converter/test_convert_session.py +478 -0
- deltacat/tests/compute/converter/utils.py +123 -0
- deltacat/tests/compute/resource_estimation/test_delta.py +0 -16
- deltacat/tests/compute/test_compact_partition_incremental.py +2 -42
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +5 -46
- deltacat/tests/compute/test_compact_partition_params.py +3 -3
- deltacat/tests/compute/test_compact_partition_rebase.py +1 -46
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +5 -46
- deltacat/tests/compute/test_util_common.py +19 -12
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +13 -22
- deltacat/tests/local_deltacat_storage/__init__.py +76 -103
- deltacat/tests/storage/__init__.py +0 -0
- deltacat/tests/storage/conftest.py +25 -0
- deltacat/tests/storage/main/__init__.py +0 -0
- deltacat/tests/storage/main/test_main_storage.py +1399 -0
- deltacat/tests/storage/model/__init__.py +0 -0
- deltacat/tests/storage/model/test_delete_parameters.py +21 -0
- deltacat/tests/storage/model/test_metafile_io.py +2535 -0
- deltacat/tests/storage/model/test_schema.py +308 -0
- deltacat/tests/storage/model/test_shard.py +22 -0
- deltacat/tests/storage/model/test_table_version.py +110 -0
- deltacat/tests/storage/model/test_transaction.py +308 -0
- deltacat/tests/storage/rivulet/__init__.py +0 -0
- deltacat/tests/storage/rivulet/conftest.py +149 -0
- deltacat/tests/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +93 -0
- deltacat/tests/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/storage/rivulet/schema/test_schema.py +241 -0
- deltacat/tests/storage/rivulet/test_dataset.py +406 -0
- deltacat/tests/storage/rivulet/test_manifest.py +67 -0
- deltacat/tests/storage/rivulet/test_sst_interval_tree.py +232 -0
- deltacat/tests/storage/rivulet/test_utils.py +122 -0
- deltacat/tests/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/storage/rivulet/writer/test_dataset_write_then_read.py +341 -0
- deltacat/tests/storage/rivulet/writer/test_dataset_writer.py +79 -0
- deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
- deltacat/tests/test_deltacat_api.py +39 -0
- deltacat/tests/test_utils/filesystem.py +14 -0
- deltacat/tests/test_utils/message_pack_utils.py +54 -0
- deltacat/tests/test_utils/pyarrow.py +8 -15
- deltacat/tests/test_utils/storage.py +266 -3
- deltacat/tests/utils/test_daft.py +3 -3
- deltacat/tests/utils/test_pyarrow.py +0 -432
- deltacat/types/partial_download.py +1 -1
- deltacat/types/tables.py +1 -1
- deltacat/utils/export.py +59 -0
- deltacat/utils/filesystem.py +320 -0
- deltacat/utils/metafile_locator.py +73 -0
- deltacat/utils/pyarrow.py +36 -183
- deltacat-2.0.dist-info/METADATA +65 -0
- deltacat-2.0.dist-info/RECORD +347 -0
- deltacat/aws/redshift/__init__.py +0 -19
- deltacat/catalog/default_catalog_impl/__init__.py +0 -369
- deltacat/io/dataset.py +0 -73
- deltacat/io/read_api.py +0 -143
- deltacat/storage/model/delete_parameters.py +0 -40
- deltacat/storage/model/partition_spec.py +0 -71
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +0 -253
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +0 -45
- deltacat-1.1.35.dist-info/METADATA +0 -64
- deltacat-1.1.35.dist-info/RECORD +0 -219
- /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
- /deltacat/{io/aws → catalog/main}/__init__.py +0 -0
- /deltacat/{io/aws/redshift → compute/converter}/__init__.py +0 -0
- /deltacat/{tests/io → compute/converter/model}/__init__.py +0 -0
- /deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +0 -0
- /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
- {deltacat-1.1.35.dist-info → deltacat-2.0.dist-info}/LICENSE +0 -0
- {deltacat-1.1.35.dist-info → deltacat-2.0.dist-info}/WHEEL +0 -0
- {deltacat-1.1.35.dist-info → deltacat-2.0.dist-info}/top_level.txt +0 -0
deltacat/catalog/iceberg/impl.py
@@ -0,0 +1,368 @@
+import logging
+
+from typing import Any, Dict, List, Optional, Union
+
+from daft import DataFrame
+
+from deltacat import logs
+from deltacat.catalog.model.table_definition import TableDefinition
+from deltacat.exceptions import TableAlreadyExistsError
+from deltacat.storage.iceberg.iceberg_scan_planner import IcebergScanPlanner
+from deltacat.storage.iceberg.model import PartitionSchemeMapper, SchemaMapper
+from deltacat.storage.model.partition import PartitionScheme
+from deltacat.storage.iceberg.impl import _get_native_catalog
+from deltacat.storage.model.sort_key import SortScheme
+from deltacat.storage.model.list_result import ListResult
+from deltacat.storage.model.namespace import Namespace, NamespaceProperties
+from deltacat.storage.model.schema import Schema
+from deltacat.storage.model.table import TableProperties
+from deltacat.storage.model.types import (
+    DistributedDataset,
+    LifecycleState,
+    LocalDataset,
+    LocalTable,
+    StreamFormat,
+)
+from deltacat.storage.iceberg import impl as IcebergStorage
+from deltacat.types.media import ContentType
+from deltacat.types.tables import TableWriteMode
+from deltacat.constants import DEFAULT_NAMESPACE
+from deltacat.catalog.iceberg.iceberg_catalog_config import IcebergCatalogConfig
+
+from pyiceberg.catalog import Catalog, load_catalog
+from pyiceberg.transforms import BucketTransform
+
+logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+
+# catalog functions
+def initialize(*args, config: IcebergCatalogConfig, **kwargs) -> Catalog:
+    """
+    Initializes an Iceberg catalog with the given config.
+
+    NOTE: because PyIceberg catalogs are not pickle-able, we cannot accept them as catalog initialization parameters,
+    since catalog initialization parameters are passed to Ray actors (see: :class:`deltacat.catalog.Catalogs`)
+
+    Args:
+        **kwargs: Arguments to be passed to PyIceberg Catalog.
+            If 'catalog' is provided as a PyIceberg Catalog instance, it will be used directly.
+            Otherwise, the arguments will be used to load a catalog via pyiceberg.catalog.load_catalog.
+
+    Returns:
+        IcebergCatalogConfig: Configuration wrapper containing the PyIceberg Catalog.
+    """
+
+    # If no catalog is provided, try to load one with PyIceberg
+
+    load_catalog_args = {"type": config.type.value, **config.properties, **kwargs}
+    catalog = load_catalog(**load_catalog_args)
+    return catalog
+
+
+# table functions
+def write_to_table(
+    data: Union[LocalTable, LocalDataset, DistributedDataset],
+    table: str,
+    *args,
+    namespace: Optional[str] = None,
+    mode: TableWriteMode = TableWriteMode.AUTO,
+    content_type: ContentType = ContentType.PARQUET,
+    **kwargs,
+) -> None:
+    """Write local or distributed data to a table. Raises an error if the
+    table does not exist and the table write mode is not CREATE or AUTO.
+
+    When creating a table, all `create_table` parameters may be optionally
+    specified as additional keyword arguments. When appending to, or replacing,
+    an existing table, all `alter_table` parameters may be optionally specified
+    as additional keyword arguments."""
+
+    # TODO (pdames): derive schema automatically from data if not
+    #  explicitly specified in kwargs, and table needs to be created
+    #  kwargs["schema"] = kwargs["schema"] or derived_schema
+    kwargs["fail_if_exists"] = mode == TableWriteMode.CREATE
+    table_definition = (
+        create_table(
+            table,
+            namespace=namespace,
+            *args,
+            **kwargs,
+        )
+        if (mode == TableWriteMode.AUTO or mode == TableWriteMode.CREATE)
+        else get_table(table, namespace=namespace, *args, **kwargs)
+    )
+
+    # TODO(pdames): Use native DeltaCAT models to map from Iceberg partitioning to Daft partitioning...
+    #  this lets us re-use a single model-mapper instead of different per-catalog model mappers
+    schema = SchemaMapper.unmap(table_definition.table_version.schema)
+    partition_spec = PartitionSchemeMapper.unmap(
+        table_definition.table_version.partition_scheme,
+        schema,
+    )
+    if isinstance(data, DataFrame):
+        for partition_field in partition_spec.fields:
+            if isinstance(partition_field.transform, BucketTransform):
+                ice_bucket_transform: BucketTransform = partition_field.transform
+                # TODO(pdames): Get a type-checked Iceberg Table automatically via unmap()
+                table_location = table_definition.table.native_object.location()
+                path = kwargs.get("path") or f"{table_location}/data"
+                if content_type == ContentType.PARQUET:
+                    source_field = schema.find_field(
+                        name_or_id=partition_field.source_id
+                    )
+                    out_df = data.write_parquet(
+                        path,
+                        partition_cols=[
+                            data[source_field.name].partitioning.iceberg_bucket(
+                                ice_bucket_transform.num_buckets
+                            ),
+                        ],
+                    )
+                    # TODO(pdames): only append s3:// to output file paths when writing to S3!
+                    out_file_paths = [f"s3://{val}" for val in out_df.to_arrow()[0]]
+                    from deltacat.catalog.iceberg import overrides
+
+                    overrides.append(
+                        table_definition.table.native_object,
+                        out_file_paths,
+                    )
+                else:
+                    raise NotImplementedError(
+                        f"iceberg writes not implemented for content type: {content_type}"
+                    )
+            else:
+                raise NotImplementedError(
+                    f"daft partitioning not implemented for iceberg transform: {partition_field.transform}"
+                )
+    else:
+        raise NotImplementedError(
+            f"iceberg write-back not implemented for data type: {type(data)}"
+        )
+
+
+def read_table(
+    table: str, *args, namespace: Optional[str] = None, **kwargs
+) -> DistributedDataset:
+    """Read a table into a distributed dataset."""
+    raise NotImplementedError("read_table not implemented")
+
+
+def alter_table(
+    table: str,
+    *args,
+    namespace: Optional[str] = None,
+    lifecycle_state: Optional[LifecycleState] = None,
+    schema_updates: Optional[Dict[str, Any]] = None,
+    partition_updates: Optional[Dict[str, Any]] = None,
+    sort_keys: Optional[SortScheme] = None,
+    description: Optional[str] = None,
+    properties: Optional[TableProperties] = None,
+    **kwargs,
+) -> None:
+    """Alter table definition."""
+    raise NotImplementedError("alter_table not implemented")
+
+
+def create_table(
+    name: str,
+    *args,
+    namespace: Optional[str] = None,
+    version: Optional[str] = None,
+    lifecycle_state: Optional[LifecycleState] = None,
+    schema: Optional[Schema] = None,
+    partition_scheme: Optional[PartitionScheme] = None,
+    sort_keys: Optional[SortScheme] = None,
+    description: Optional[str] = None,
+    table_properties: Optional[TableProperties] = None,
+    namespace_properties: Optional[NamespaceProperties] = None,
+    content_types: Optional[List[ContentType]] = None,
+    fail_if_exists: bool = True,
+    **kwargs,
+) -> TableDefinition:
+    """Create an empty table in the catalog"""
+
+    namespace = namespace or default_namespace()
+    existing_table = get_table(
+        name,
+        *args,
+        namespace=namespace,
+        **kwargs,
+    )
+    if existing_table:
+        if fail_if_exists:
+            err_msg = (
+                f"Table `{namespace}.{name}` already exists. "
+                f"To suppress this error, rerun `create_table()` with "
+                f"`fail_if_exists=False`."
+            )
+            raise TableAlreadyExistsError(err_msg)
+        else:
+            logger.debug(f"Returning existing table: `{namespace}.{name}`")
+            return existing_table
+
+    if not IcebergStorage.namespace_exists(namespace, **kwargs):
+        logger.debug(f"Namespace {namespace} doesn't exist. Creating it...")
+        IcebergStorage.create_namespace(
+            namespace,
+            properties=namespace_properties or {},
+            **kwargs,
+        )
+
+    IcebergStorage.create_table_version(
+        namespace=namespace,
+        table_name=name,
+        table_version=version,
+        schema=schema,
+        partition_scheme=partition_scheme,
+        sort_keys=sort_keys,
+        table_properties=table_properties,
+        **kwargs,
+    )
+
+    return get_table(
+        name,
+        *args,
+        namespace=namespace,
+        **kwargs,
+    )
+
+
+def drop_table(
+    name: str,
+    *args,
+    namespace: Optional[str] = None,
+    table_version: Optional[str] = None,
+    purge: bool = False,
+    **kwargs,
+) -> None:
+    """Drop a table from the catalog and optionally purge it. Raises an error
+    if the table does not exist."""
+    raise NotImplementedError("drop_table not implemented")
+
+
+def refresh_table(table: str, *args, namespace: Optional[str] = None, **kwargs) -> None:
+    """Refresh metadata cached on the Ray cluster for the given table."""
+    raise NotImplementedError("refresh_table not implemented")
+
+
+def list_tables(
+    *args, namespace: Optional[str] = None, **kwargs
+) -> ListResult[TableDefinition]:
+    """List a page of table definitions. Raises an error if the given namespace
+    does not exist."""
+    raise NotImplementedError("list_tables not implemented")
+
+
+def get_table(
+    name: str,
+    *args,
+    namespace: Optional[str] = None,
+    table_version: Optional[str] = None,
+    stream_format: StreamFormat = StreamFormat.DELTACAT,
+    **kwargs,
+) -> Optional[TableDefinition]:
+    """Get table definition metadata.
+
+    Args:
+        name: Name of the table to retrieve
+        namespace: Optional namespace of the table. Uses default namespace if not specified.
+        table_version: Optional specific version of the table to retrieve.
+            If not specified, the latest version is used.
+        stream_format: Optional stream format to retrieve
+
+    Returns:
+        Deltacat TableDefinition if the table exists, None otherwise.
+    """
+    namespace = namespace or default_namespace()
+    stream = IcebergStorage.get_stream(namespace=namespace, table_name=name, **kwargs)
+    if not stream:
+        return None
+    table_obj = IcebergStorage.get_table(namespace=namespace, table_name=name, **kwargs)
+    if not table_obj:
+        return None
+    table_version_obj = None
+    if table_version:
+        table_version_obj = IcebergStorage.get_table_version(
+            namespace=namespace, table_name=name, table_version=table_version, **kwargs
+        )
+    else:
+        table_version_obj = IcebergStorage.get_latest_table_version(
+            namespace=namespace, table_name=name, **kwargs
+        )
+    if not table_version_obj:
+        return None
+    scan_planner = IcebergScanPlanner(_get_native_catalog(**kwargs))
+    return TableDefinition.of(
+        table=table_obj,
+        table_version=table_version_obj,
+        stream=stream,
+        native_object=table_obj.native_object,
+        scan_planner=scan_planner,
+    )
+
+
+def truncate_table(
+    table: str, *args, namespace: Optional[str] = None, **kwargs
+) -> None:
+    """Truncate table data. Raises an error if the table does not exist."""
+    raise NotImplementedError("truncate_table not implemented")
+
+
+def rename_table(
+    table: str, new_name: str, *args, namespace: Optional[str] = None, **kwargs
+) -> None:
+    """Rename a table."""
+    raise NotImplementedError("rename_table not implemented")
+
+
+def table_exists(table: str, *args, namespace: Optional[str] = None, **kwargs) -> bool:
+    """Returns True if the given table exists, False if not."""
+    namespace = namespace or default_namespace()
+    return IcebergStorage.table_exists(namespace=namespace, table_name=table, **kwargs)
+
+
+# namespace functions
+def list_namespaces(*args, **kwargs) -> ListResult[Namespace]:
+    """List a page of table namespaces."""
+    return IcebergStorage.list_namespaces(**kwargs)
+
+
+def get_namespace(namespace: str, *args, **kwargs) -> Optional[Namespace]:
+    """Gets table namespace metadata for the specified table namespace. Returns
+    None if the given namespace does not exist."""
+    return IcebergStorage.get_namespace(namespace, **kwargs)
+
+
+def namespace_exists(namespace: str, *args, **kwargs) -> bool:
+    """Returns True if the given table namespace exists, False if not."""
+    return IcebergStorage.namespace_exists(namespace, **kwargs)
+
+
+def create_namespace(
+    namespace: str, *args, properties: Optional[NamespaceProperties] = None, **kwargs
+) -> Namespace:
+    """Creates a table namespace with the given name and properties. Returns
+    the created namespace. Raises an error if the namespace already exists."""
+    raise NotImplementedError("create_namespace not implemented")
+
+
+def alter_namespace(
+    namespace: str,
+    *args,
+    properties: Optional[NamespaceProperties] = None,
+    new_namespace: Optional[str] = None,
+    **kwargs,
+) -> None:
+    """Alter table namespace definition."""
+    raise NotImplementedError("alter_namespace not implemented")
+
+
+def drop_namespace(namespace: str, *args, purge: bool = False, **kwargs) -> None:
+    """Drop the given namespace and all of its tables from the catalog,
+    optionally purging them."""
+    raise NotImplementedError("drop_namespace not implemented")
+
+
+def default_namespace(*args, **kwargs) -> str:
+    """Returns the default namespace for the catalog."""
+    return DEFAULT_NAMESPACE
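The new `deltacat/catalog/iceberg/impl.py` module above exposes the catalog as module-level functions. A minimal usage sketch follows; it assumes `IcebergCatalogConfig` accepts a PyIceberg catalog `type` enum and a `properties` dict (its exact constructor is not shown in this diff), and every name below is hypothetical:

# Illustrative sketch only; the IcebergCatalogConfig constructor, catalog
# properties, and all names below are assumptions, not part of this diff.
from pyiceberg.catalog import CatalogType

from deltacat.catalog.iceberg import impl as IcebergCatalog
from deltacat.catalog.iceberg.iceberg_catalog_config import IcebergCatalogConfig

# initialize() forwards {"type": config.type.value, **config.properties} to
# pyiceberg.catalog.load_catalog(), so the config must carry an enum `type`
# and a `properties` dict.
config = IcebergCatalogConfig(
    type=CatalogType.SQL,  # hypothetical catalog type
    properties={"uri": "sqlite:///iceberg_catalog.db"},  # hypothetical properties
)
native_catalog = IcebergCatalog.initialize(config=config)

# create_table() returns the existing table when fail_if_exists=False and the
# table already exists; otherwise it raises TableAlreadyExistsError. Additional
# storage kwargs (e.g. the native catalog handle used by _get_native_catalog)
# may also be required depending on how IcebergStorage is configured.
table_def = IcebergCatalog.create_table(
    "events",  # hypothetical table name
    namespace="analytics",  # hypothetical namespace
    fail_if_exists=False,
)

# write_to_table() currently supports only Daft DataFrames written as Parquet
# to tables partitioned by an Iceberg bucket transform:
# IcebergCatalog.write_to_table(df, "events", namespace="analytics")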
deltacat/catalog/iceberg/overrides.py
@@ -0,0 +1,74 @@
+import pyarrow.parquet as pq
+
+from typing import Iterator, List
+
+from pyarrow.fs import FileSystem
+
+from pyiceberg.io.pyarrow import (
+    fill_parquet_file_metadata,
+    compute_statistics_plan,
+    parquet_path_to_id_mapping,
+)
+from pyiceberg.table import Table, _MergingSnapshotProducer
+from pyiceberg.table.snapshots import Operation
+from pyiceberg.manifest import DataFile, DataFileContent, FileFormat
+from pyiceberg.types import StructType, NestedField, IntegerType
+from pyiceberg.typedef import Record
+
+
+def append(table: Table, paths: List[str]) -> None:
+    """
+    Append files to the table.
+    """
+    # if len(table.sort_order().fields) > 0:
+    #     raise ValueError("Cannot write to tables with a sort-order")
+
+    data_files = write_file(table, paths)
+    merge = _MergingSnapshotProducer(operation=Operation.APPEND, table=table)
+    for data_file in data_files:
+        merge.append_data_file(data_file)
+
+    merge.commit()
+
+
+def write_file(table: Table, paths: Iterator[str]) -> Iterator[DataFile]:
+    data_files = []
+    for file_path in paths:
+        partition_dir = file_path.split("/")[-2]
+        partition_value = int(partition_dir.split("=")[-1])
+        fs_tuple = FileSystem.from_uri(file_path)
+        fs = fs_tuple[0]
+        fs_path = fs_tuple[1]
+        with fs.open_input_file(fs_path) as native_file:
+            parquet_metadata = pq.read_metadata(native_file)
+            data_file = DataFile(
+                content=DataFileContent.DATA,
+                file_path=file_path,
+                file_format=FileFormat.PARQUET,
+                partition=Record(
+                    **{
+                        "struct": StructType(
+                            NestedField(
+                                0,
+                                table.spec().fields[0].name,
+                                IntegerType(),
+                                required=False,
+                            )
+                        ),
+                        **{table.spec().fields[0].name: partition_value},
+                    }
+                ),
+                file_size_in_bytes=native_file.size(),
+                sort_order_id=None,
+                spec_id=table.spec().spec_id,
+                equality_ids=None,
+                key_metadata=None,
+            )
+            fill_parquet_file_metadata(
+                data_file=data_file,
+                parquet_metadata=parquet_metadata,
+                stats_columns=compute_statistics_plan(table.schema(), table.properties),
+                parquet_column_mapping=parquet_path_to_id_mapping(table.schema()),
+            )
+            data_files.append(data_file)
+    return data_files
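The `overrides.append()` helper added above commits externally written Parquet files to an Iceberg table as a new APPEND snapshot. A short usage sketch follows; the catalog name, table identifier, and file paths are hypothetical, and each path must sit under a `<partition_field>=<int>` directory because `write_file()` parses the partition value from the second-to-last path segment:

# Illustrative sketch only; catalog name, table identifier, and paths are hypothetical.
from pyiceberg.catalog import load_catalog

from deltacat.catalog.iceberg import overrides

catalog = load_catalog("my_catalog")            # hypothetical catalog name
table = catalog.load_table("analytics.events")  # hypothetical table identifier

# Hive-style partition directories are required, e.g. .../<bucket_col>=3/part-0.parquet,
# since write_file() derives each file's partition value from that directory name.
paths = ["s3://my-bucket/analytics/events/data/id_bucket=3/part-0.parquet"]

overrides.append(table, paths)  # commits an APPEND snapshot via _MergingSnapshotProducer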