deltacat 1.1.35__py3-none-any.whl → 2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +42 -3
- deltacat/annotations.py +36 -0
- deltacat/api.py +168 -0
- deltacat/aws/s3u.py +4 -4
- deltacat/benchmarking/benchmark_engine.py +82 -0
- deltacat/benchmarking/benchmark_report.py +86 -0
- deltacat/benchmarking/benchmark_suite.py +11 -0
- deltacat/benchmarking/conftest.py +21 -0
- deltacat/benchmarking/data/random_row_generator.py +94 -0
- deltacat/benchmarking/data/row_generator.py +10 -0
- deltacat/benchmarking/test_benchmark_pipeline.py +106 -0
- deltacat/catalog/__init__.py +14 -0
- deltacat/catalog/delegate.py +199 -106
- deltacat/catalog/iceberg/__init__.py +4 -0
- deltacat/catalog/iceberg/iceberg_catalog_config.py +26 -0
- deltacat/catalog/iceberg/impl.py +368 -0
- deltacat/catalog/iceberg/overrides.py +74 -0
- deltacat/catalog/interface.py +273 -76
- deltacat/catalog/main/impl.py +720 -0
- deltacat/catalog/model/catalog.py +227 -20
- deltacat/catalog/model/properties.py +116 -0
- deltacat/catalog/model/table_definition.py +32 -1
- deltacat/compute/compactor/model/compaction_session_audit_info.py +7 -3
- deltacat/compute/compactor/model/delta_annotated.py +3 -3
- deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
- deltacat/compute/compactor/model/delta_file_locator.py +3 -1
- deltacat/compute/compactor/model/round_completion_info.py +5 -5
- deltacat/compute/compactor/model/table_object_store.py +3 -2
- deltacat/compute/compactor/repartition_session.py +1 -1
- deltacat/compute/compactor/steps/dedupe.py +11 -4
- deltacat/compute/compactor/steps/hash_bucket.py +1 -1
- deltacat/compute/compactor/steps/materialize.py +6 -2
- deltacat/compute/compactor/utils/io.py +1 -1
- deltacat/compute/compactor/utils/sort_key.py +9 -2
- deltacat/compute/compactor_v2/compaction_session.py +2 -3
- deltacat/compute/compactor_v2/constants.py +1 -30
- deltacat/compute/compactor_v2/deletes/utils.py +3 -3
- deltacat/compute/compactor_v2/model/merge_input.py +1 -1
- deltacat/compute/compactor_v2/private/compaction_utils.py +5 -5
- deltacat/compute/compactor_v2/steps/merge.py +11 -80
- deltacat/compute/compactor_v2/utils/content_type_params.py +0 -17
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/io.py +1 -1
- deltacat/compute/compactor_v2/utils/primary_key_index.py +3 -15
- deltacat/compute/compactor_v2/utils/task_options.py +23 -43
- deltacat/compute/converter/constants.py +4 -0
- deltacat/compute/converter/converter_session.py +143 -0
- deltacat/compute/converter/model/convert_input.py +69 -0
- deltacat/compute/converter/model/convert_input_files.py +61 -0
- deltacat/compute/converter/model/converter_session_params.py +99 -0
- deltacat/compute/converter/pyiceberg/__init__.py +0 -0
- deltacat/compute/converter/pyiceberg/catalog.py +75 -0
- deltacat/compute/converter/pyiceberg/overrides.py +135 -0
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +251 -0
- deltacat/compute/converter/steps/__init__.py +0 -0
- deltacat/compute/converter/steps/convert.py +211 -0
- deltacat/compute/converter/steps/dedupe.py +60 -0
- deltacat/compute/converter/utils/__init__.py +0 -0
- deltacat/compute/converter/utils/convert_task_options.py +88 -0
- deltacat/compute/converter/utils/converter_session_utils.py +109 -0
- deltacat/compute/converter/utils/iceberg_columns.py +82 -0
- deltacat/compute/converter/utils/io.py +43 -0
- deltacat/compute/converter/utils/s3u.py +133 -0
- deltacat/compute/resource_estimation/delta.py +1 -19
- deltacat/constants.py +47 -1
- deltacat/env.py +51 -0
- deltacat/examples/__init__.py +0 -0
- deltacat/examples/basic_logging.py +101 -0
- deltacat/examples/common/__init__.py +0 -0
- deltacat/examples/common/fixtures.py +15 -0
- deltacat/examples/hello_world.py +27 -0
- deltacat/examples/iceberg/__init__.py +0 -0
- deltacat/examples/iceberg/iceberg_bucket_writer.py +139 -0
- deltacat/examples/iceberg/iceberg_reader.py +149 -0
- deltacat/exceptions.py +51 -9
- deltacat/logs.py +4 -1
- deltacat/storage/__init__.py +118 -28
- deltacat/storage/iceberg/__init__.py +0 -0
- deltacat/storage/iceberg/iceberg_scan_planner.py +28 -0
- deltacat/storage/iceberg/impl.py +737 -0
- deltacat/storage/iceberg/model.py +709 -0
- deltacat/storage/interface.py +217 -134
- deltacat/storage/main/__init__.py +0 -0
- deltacat/storage/main/impl.py +2077 -0
- deltacat/storage/model/delta.py +118 -71
- deltacat/storage/model/interop.py +24 -0
- deltacat/storage/model/list_result.py +8 -0
- deltacat/storage/model/locator.py +93 -3
- deltacat/{aws/redshift → storage}/model/manifest.py +122 -98
- deltacat/storage/model/metafile.py +1316 -0
- deltacat/storage/model/namespace.py +34 -18
- deltacat/storage/model/partition.py +362 -37
- deltacat/storage/model/scan/__init__.py +0 -0
- deltacat/storage/model/scan/push_down.py +19 -0
- deltacat/storage/model/scan/scan_plan.py +10 -0
- deltacat/storage/model/scan/scan_task.py +34 -0
- deltacat/storage/model/schema.py +892 -0
- deltacat/storage/model/shard.py +47 -0
- deltacat/storage/model/sort_key.py +170 -13
- deltacat/storage/model/stream.py +208 -80
- deltacat/storage/model/table.py +123 -29
- deltacat/storage/model/table_version.py +322 -46
- deltacat/storage/model/transaction.py +757 -0
- deltacat/storage/model/transform.py +198 -61
- deltacat/storage/model/types.py +111 -13
- deltacat/storage/rivulet/__init__.py +11 -0
- deltacat/storage/rivulet/arrow/__init__.py +0 -0
- deltacat/storage/rivulet/arrow/serializer.py +75 -0
- deltacat/storage/rivulet/dataset.py +744 -0
- deltacat/storage/rivulet/dataset_executor.py +87 -0
- deltacat/storage/rivulet/feather/__init__.py +5 -0
- deltacat/storage/rivulet/feather/file_reader.py +136 -0
- deltacat/storage/rivulet/feather/serializer.py +35 -0
- deltacat/storage/rivulet/fs/__init__.py +0 -0
- deltacat/storage/rivulet/fs/file_provider.py +105 -0
- deltacat/storage/rivulet/fs/file_store.py +130 -0
- deltacat/storage/rivulet/fs/input_file.py +76 -0
- deltacat/storage/rivulet/fs/output_file.py +86 -0
- deltacat/storage/rivulet/logical_plan.py +105 -0
- deltacat/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/storage/rivulet/metastore/delta.py +190 -0
- deltacat/storage/rivulet/metastore/json_sst.py +105 -0
- deltacat/storage/rivulet/metastore/sst.py +82 -0
- deltacat/storage/rivulet/metastore/sst_interval_tree.py +260 -0
- deltacat/storage/rivulet/mvp/Table.py +101 -0
- deltacat/storage/rivulet/mvp/__init__.py +5 -0
- deltacat/storage/rivulet/parquet/__init__.py +5 -0
- deltacat/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/storage/rivulet/parquet/file_reader.py +127 -0
- deltacat/storage/rivulet/parquet/serializer.py +37 -0
- deltacat/storage/rivulet/reader/__init__.py +0 -0
- deltacat/storage/rivulet/reader/block_scanner.py +378 -0
- deltacat/storage/rivulet/reader/data_reader.py +136 -0
- deltacat/storage/rivulet/reader/data_scan.py +63 -0
- deltacat/storage/rivulet/reader/dataset_metastore.py +178 -0
- deltacat/storage/rivulet/reader/dataset_reader.py +156 -0
- deltacat/storage/rivulet/reader/pyarrow_data_reader.py +121 -0
- deltacat/storage/rivulet/reader/query_expression.py +99 -0
- deltacat/storage/rivulet/reader/reader_type_registrar.py +84 -0
- deltacat/storage/rivulet/schema/__init__.py +0 -0
- deltacat/storage/rivulet/schema/datatype.py +128 -0
- deltacat/storage/rivulet/schema/schema.py +251 -0
- deltacat/storage/rivulet/serializer.py +40 -0
- deltacat/storage/rivulet/serializer_factory.py +42 -0
- deltacat/storage/rivulet/writer/__init__.py +0 -0
- deltacat/storage/rivulet/writer/dataset_writer.py +29 -0
- deltacat/storage/rivulet/writer/memtable_dataset_writer.py +294 -0
- deltacat/tests/_io/__init__.py +1 -0
- deltacat/tests/catalog/test_catalogs.py +324 -0
- deltacat/tests/catalog/test_default_catalog_impl.py +16 -8
- deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
- deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
- deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
- deltacat/tests/compute/compact_partition_test_cases.py +19 -53
- deltacat/tests/compute/compactor/steps/test_repartition.py +2 -2
- deltacat/tests/compute/compactor/utils/test_io.py +6 -8
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +0 -466
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +1 -273
- deltacat/tests/compute/conftest.py +75 -0
- deltacat/tests/compute/converter/__init__.py +0 -0
- deltacat/tests/compute/converter/conftest.py +80 -0
- deltacat/tests/compute/converter/test_convert_session.py +478 -0
- deltacat/tests/compute/converter/utils.py +123 -0
- deltacat/tests/compute/resource_estimation/test_delta.py +0 -16
- deltacat/tests/compute/test_compact_partition_incremental.py +2 -42
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +5 -46
- deltacat/tests/compute/test_compact_partition_params.py +3 -3
- deltacat/tests/compute/test_compact_partition_rebase.py +1 -46
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +5 -46
- deltacat/tests/compute/test_util_common.py +19 -12
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +13 -22
- deltacat/tests/local_deltacat_storage/__init__.py +76 -103
- deltacat/tests/storage/__init__.py +0 -0
- deltacat/tests/storage/conftest.py +25 -0
- deltacat/tests/storage/main/__init__.py +0 -0
- deltacat/tests/storage/main/test_main_storage.py +1399 -0
- deltacat/tests/storage/model/__init__.py +0 -0
- deltacat/tests/storage/model/test_delete_parameters.py +21 -0
- deltacat/tests/storage/model/test_metafile_io.py +2535 -0
- deltacat/tests/storage/model/test_schema.py +308 -0
- deltacat/tests/storage/model/test_shard.py +22 -0
- deltacat/tests/storage/model/test_table_version.py +110 -0
- deltacat/tests/storage/model/test_transaction.py +308 -0
- deltacat/tests/storage/rivulet/__init__.py +0 -0
- deltacat/tests/storage/rivulet/conftest.py +149 -0
- deltacat/tests/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +93 -0
- deltacat/tests/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/storage/rivulet/schema/test_schema.py +241 -0
- deltacat/tests/storage/rivulet/test_dataset.py +406 -0
- deltacat/tests/storage/rivulet/test_manifest.py +67 -0
- deltacat/tests/storage/rivulet/test_sst_interval_tree.py +232 -0
- deltacat/tests/storage/rivulet/test_utils.py +122 -0
- deltacat/tests/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/storage/rivulet/writer/test_dataset_write_then_read.py +341 -0
- deltacat/tests/storage/rivulet/writer/test_dataset_writer.py +79 -0
- deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
- deltacat/tests/test_deltacat_api.py +39 -0
- deltacat/tests/test_utils/filesystem.py +14 -0
- deltacat/tests/test_utils/message_pack_utils.py +54 -0
- deltacat/tests/test_utils/pyarrow.py +8 -15
- deltacat/tests/test_utils/storage.py +266 -3
- deltacat/tests/utils/test_daft.py +3 -3
- deltacat/tests/utils/test_pyarrow.py +0 -432
- deltacat/types/partial_download.py +1 -1
- deltacat/types/tables.py +1 -1
- deltacat/utils/export.py +59 -0
- deltacat/utils/filesystem.py +320 -0
- deltacat/utils/metafile_locator.py +73 -0
- deltacat/utils/pyarrow.py +36 -183
- deltacat-2.0.dist-info/METADATA +65 -0
- deltacat-2.0.dist-info/RECORD +347 -0
- deltacat/aws/redshift/__init__.py +0 -19
- deltacat/catalog/default_catalog_impl/__init__.py +0 -369
- deltacat/io/dataset.py +0 -73
- deltacat/io/read_api.py +0 -143
- deltacat/storage/model/delete_parameters.py +0 -40
- deltacat/storage/model/partition_spec.py +0 -71
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +0 -253
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +0 -45
- deltacat-1.1.35.dist-info/METADATA +0 -64
- deltacat-1.1.35.dist-info/RECORD +0 -219
- /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
- /deltacat/{io/aws → catalog/main}/__init__.py +0 -0
- /deltacat/{io/aws/redshift → compute/converter}/__init__.py +0 -0
- /deltacat/{tests/io → compute/converter/model}/__init__.py +0 -0
- /deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +0 -0
- /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
- {deltacat-1.1.35.dist-info → deltacat-2.0.dist-info}/LICENSE +0 -0
- {deltacat-1.1.35.dist-info → deltacat-2.0.dist-info}/WHEEL +0 -0
- {deltacat-1.1.35.dist-info → deltacat-2.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,139 @@
|
|
1
|
+
import os
|
2
|
+
import logging
|
3
|
+
|
4
|
+
import daft
|
5
|
+
import deltacat as dc
|
6
|
+
|
7
|
+
from deltacat import logs
|
8
|
+
from deltacat import IcebergCatalog
|
9
|
+
from deltacat.examples.common.fixtures import (
|
10
|
+
store_cli_args_in_os_environ,
|
11
|
+
)
|
12
|
+
|
13
|
+
from pyiceberg.schema import (
|
14
|
+
Schema,
|
15
|
+
NestedField,
|
16
|
+
DoubleType,
|
17
|
+
StringType,
|
18
|
+
)
|
19
|
+
from pyiceberg.partitioning import PartitionSpec, PartitionField
|
20
|
+
from pyiceberg.transforms import BucketTransform
|
21
|
+
|
22
|
+
from deltacat.storage.iceberg.model import (
|
23
|
+
SchemaMapper,
|
24
|
+
PartitionSchemeMapper,
|
25
|
+
)
|
26
|
+
from deltacat.env import create_ray_runtime_environment
|
27
|
+
|
28
|
+
# initialize the driver logger
|
29
|
+
driver_logger = logs.configure_application_logger(logging.getLogger(__name__))
|
30
|
+
|
31
|
+
|
32
|
+
def run(warehouse="s3://my-bucket/my/key/prefix", **kwargs):
    """Write a small bucketed Daft dataframe to an Iceberg table via DeltaCAT.

    Initializes DeltaCAT with a single catalog named "iceberg" (configured
    with Glue properties), writes a four-row dataframe to
    `test_namespace.test_table_bucketed`, then fetches the table definition
    and prints it.

    Args:
        warehouse: Value used for the Iceberg catalog's "warehouse" property.
        **kwargs: Accepted but not read by this function; lets callers splat
            a larger mapping (e.g. `os.environ`) into the call.
    """
    # build the runtime environment first so it can be handed to ray.init()
    ray_runtime_env = create_ray_runtime_environment()

    # DeltaCAT catalog backed by the Apache Iceberg implementation of
    # deltacat.catalog.interface. The arguments after `impl` are the kwargs
    # for pyiceberg.catalog.load_catalog; for additional properties see:
    # https://py.iceberg.apache.org/configuration/
    iceberg_catalog = dc.Catalog(
        impl=IcebergCatalog,
        # the name of the Iceberg catalog is "example-iceberg-catalog"
        name="example-iceberg-catalog",
        properties={
            "type": "glue",
            "region_name": "us-east-1",
            "warehouse": warehouse,
        },
    )

    # Initialize DeltaCAT and register the available catalogs. Ray will be
    # initialized automatically via `ray.init()`. Only the "iceberg" catalog
    # is registered, so it becomes the default; with multiple catalogs, use
    # the `default_catalog_name` param to pick the default.
    dc.init(
        catalogs={"iceberg": iceberg_catalog},
        ray_init_args={"runtime_env": ray_runtime_env},
    )

    # native Iceberg table schema: one required string column and two
    # optional double columns
    symbol_field = NestedField(
        field_id=1, name="symbol", field_type=StringType(), required=True
    )
    bid_field = NestedField(
        field_id=2, name="bid", field_type=DoubleType(), required=False
    )
    ask_field = NestedField(
        field_id=3, name="ask", field_type=DoubleType(), required=False
    )
    schema = Schema(symbol_field, bid_field, ask_field)

    # native Iceberg partition spec: bucket field 1 ("symbol") into 2 buckets
    symbol_bucket = PartitionField(
        source_id=1,
        field_id=1000,
        transform=BucketTransform(2),
        name="symbol_bucket",
    )
    partition_spec = PartitionSpec(symbol_bucket)

    # NOTE: no sort order is defined or passed in this example (the
    # SortOrder / `sort_keys` lines were left disabled upstream).

    # the Daft dataframe to write
    quotes = {
        "symbol": ["amzn", "goog", "meta", "msft"],
        "bid": [157.16, 150.55, 392.03, 403.25],
        "ask": [157.17, 150.56, 392.09, 403.27],
    }
    df = daft.from_pydict(quotes)

    # write to a table named `test_namespace.test_table_bucketed`; no catalog
    # needs to be named since only the "iceberg" catalog is available
    table_name = "test_table_bucketed"
    namespace = "test_namespace"
    print(f"Creating Glue Table: {namespace}.{table_name}")
    dc_schema = SchemaMapper.map(schema)
    dc_partition_scheme = PartitionSchemeMapper.map(partition_spec, schema)
    dc.write_to_table(
        data=df,
        table=table_name,
        namespace=namespace,
        schema=dc_schema,
        partition_scheme=dc_partition_scheme,
    )

    # read the table definition back and show it
    print(f"Getting Glue Table: {namespace}.{table_name}")
    table_definition = dc.get_table(table_name, namespace)
    print(f"Retrieved Glue Table: {table_definition}")
|
111
|
+
|
112
|
+
|
113
|
+
if __name__ == "__main__":
    # (flag names, options) pairs handed to store_cli_args_in_os_environ;
    # presumably argparse-style argument specs — confirm against the helper
    example_script_args = [
        (
            ["--warehouse"],
            {"help": "S3 path for Iceberg file storage.", "type": str},
        ),
        (
            ["--STAGE"],
            {
                "help": "Example runtime environment stage (e.g. dev, alpha, beta, prod).",
                "type": str,
            },
        ),
    ]

    # store any CLI args in os.environ (per the helper's name)
    store_cli_args_in_os_environ(example_script_args)

    # run the example using os.environ as kwargs
    run(**os.environ)
|
@@ -0,0 +1,149 @@
|
|
1
|
+
import os
|
2
|
+
import logging
|
3
|
+
import deltacat as dc
|
4
|
+
|
5
|
+
from deltacat import logs
|
6
|
+
from deltacat import IcebergCatalog
|
7
|
+
from deltacat.examples.common.fixtures import (
|
8
|
+
store_cli_args_in_os_environ,
|
9
|
+
)
|
10
|
+
|
11
|
+
from pyiceberg.schema import (
|
12
|
+
Schema,
|
13
|
+
NestedField,
|
14
|
+
DoubleType,
|
15
|
+
StringType,
|
16
|
+
TimestampType,
|
17
|
+
FloatType,
|
18
|
+
StructType,
|
19
|
+
)
|
20
|
+
from pyiceberg.partitioning import PartitionSpec, PartitionField
|
21
|
+
from pyiceberg.transforms import DayTransform, IdentityTransform
|
22
|
+
from pyiceberg.table.sorting import SortField, SortOrder
|
23
|
+
|
24
|
+
from deltacat.exceptions import TableAlreadyExistsError
|
25
|
+
from deltacat.storage.iceberg.model import (
|
26
|
+
SchemaMapper,
|
27
|
+
PartitionSchemeMapper,
|
28
|
+
SortSchemeMapper,
|
29
|
+
)
|
30
|
+
from deltacat.env import create_ray_runtime_environment
|
31
|
+
|
32
|
+
# initialize the driver logger
|
33
|
+
driver_logger = logs.configure_application_logger(logging.getLogger(__name__))
|
34
|
+
|
35
|
+
|
36
|
+
def run(warehouse="s3://my-bucket/my/key/prefix", **kwargs):
    """Create (if needed) and then fetch an Iceberg table via DeltaCAT.

    Initializes DeltaCAT with a single catalog named "iceberg" (configured
    with Glue properties), tries to create `test_namespace.test_table` with a
    day-partitioned, symbol-sorted schema, tolerates the table already
    existing, then fetches the table definition and prints it.

    Args:
        warehouse: Value used for the Iceberg catalog's "warehouse" property.
        **kwargs: Accepted but not read by this function; lets callers splat
            a larger mapping (e.g. `os.environ`) into the call.
    """
    # build the runtime environment first so it can be handed to ray.init()
    ray_runtime_env = create_ray_runtime_environment()

    # DeltaCAT catalog backed by the Apache Iceberg implementation of
    # deltacat.catalog.interface. The arguments after `impl` are the kwargs
    # for pyiceberg.catalog.load_catalog; for additional properties see:
    # https://py.iceberg.apache.org/configuration/
    iceberg_catalog = dc.Catalog(
        impl=IcebergCatalog,
        # the name of the Iceberg catalog is "example-iceberg-catalog"
        name="example-iceberg-catalog",
        properties={
            "type": "glue",
            "region_name": "us-east-1",
            "warehouse": warehouse,
        },
    )

    # Initialize DeltaCAT and register the available catalogs. Ray will be
    # initialized automatically via `ray.init()`. Only the "iceberg" catalog
    # is registered, so it becomes the default; with multiple catalogs, use
    # the `default_catalog_name` param to pick the default.
    dc.init(
        catalogs={"iceberg": iceberg_catalog},
        ray_init_args={"runtime_env": ray_runtime_env},
    )

    # native Iceberg table schema, columns listed in field-id order; the
    # last column is a struct with a single optional string member
    columns = [
        NestedField(
            field_id=1, name="datetime", field_type=TimestampType(), required=True
        ),
        NestedField(field_id=2, name="symbol", field_type=StringType(), required=True),
        NestedField(field_id=3, name="bid", field_type=FloatType(), required=False),
        NestedField(field_id=4, name="ask", field_type=DoubleType(), required=False),
        NestedField(
            field_id=5,
            name="details",
            field_type=StructType(
                NestedField(
                    field_id=6,
                    name="created_by",
                    field_type=StringType(),
                    required=False,
                ),
            ),
            required=False,
        ),
    ]
    schema = Schema(*columns)

    # native Iceberg partition spec: day transform over field 1 ("datetime")
    datetime_day = PartitionField(
        source_id=1, field_id=1000, transform=DayTransform(), name="datetime_day"
    )
    partition_spec = PartitionSpec(datetime_day)

    # native Iceberg sort order: identity transform over field 2 ("symbol")
    symbol_sort = SortField(source_id=2, transform=IdentityTransform())
    sort_order = SortOrder(symbol_sort)

    # create a table named `test_namespace.test_table`; no catalog needs to
    # be named since only the "iceberg" catalog is available
    table_name = "test_table"
    namespace = "test_namespace"
    print(f"Creating Glue Table: {namespace}.{table_name}")
    try:
        # the mapper calls stay inside the `try` so the guarded region is
        # the same as passing them inline to create_table
        dc_schema = SchemaMapper.map(schema)
        dc_partition_scheme = PartitionSchemeMapper.map(partition_spec, schema)
        dc_sort_keys = SortSchemeMapper.map(sort_order, schema)
        table_definition = dc.create_table(
            table=table_name,
            namespace=namespace,
            schema=dc_schema,
            partition_scheme=dc_partition_scheme,
            sort_keys=dc_sort_keys,
        )
        print(f"Created Glue Table: {table_definition}")
    except TableAlreadyExistsError:
        # an existing table is fine for this example; fall through to the read
        print(f"Glue Table `{namespace}.{table_name}` already exists.")

    # read the table definition back and show it
    print(f"Getting Glue Table: {namespace}.{table_name}")
    table_definition = dc.get_table(table_name, namespace)
    print(f"Retrieved Glue Table: {table_definition}")
|
121
|
+
|
122
|
+
|
123
|
+
if __name__ == "__main__":
    # (flag names, options) pairs handed to store_cli_args_in_os_environ;
    # presumably argparse-style argument specs — confirm against the helper
    example_script_args = [
        (
            ["--warehouse"],
            {"help": "S3 path for Iceberg file storage.", "type": str},
        ),
        (
            ["--STAGE"],
            {
                "help": "Example runtime environment stage (e.g. dev, alpha, beta, prod).",
                "type": str,
            },
        ),
    ]

    # store any CLI args in os.environ (per the helper's name)
    store_cli_args_in_os_environ(example_script_args)

    # run the example using os.environ as kwargs
    run(**os.environ)
|
deltacat/exceptions.py
CHANGED
@@ -1,10 +1,16 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
from enum import Enum
|
3
|
-
import
|
4
|
-
import ray
|
3
|
+
from typing import Callable
|
5
4
|
import logging
|
5
|
+
|
6
6
|
import tenacity
|
7
|
-
|
7
|
+
|
8
|
+
from pyarrow.lib import ArrowException, ArrowInvalid, ArrowCapacityError
|
9
|
+
|
10
|
+
import botocore
|
11
|
+
from botocore.exceptions import BotoCoreError
|
12
|
+
|
13
|
+
import ray
|
8
14
|
from ray.exceptions import (
|
9
15
|
RayError,
|
10
16
|
RayTaskError,
|
@@ -13,14 +19,14 @@ from ray.exceptions import (
|
|
13
19
|
NodeDiedError,
|
14
20
|
OutOfMemoryError,
|
15
21
|
)
|
16
|
-
|
17
|
-
from
|
18
|
-
|
19
|
-
|
22
|
+
|
23
|
+
from daft.exceptions import DaftTransientError, DaftCoreException
|
24
|
+
|
25
|
+
import deltacat as dc
|
26
|
+
from deltacat import logs
|
20
27
|
from deltacat.utils.ray_utils.runtime import (
|
21
28
|
get_current_ray_task_id,
|
22
29
|
)
|
23
|
-
from daft.exceptions import DaftTransientError, DaftCoreException
|
24
30
|
|
25
31
|
logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
|
26
32
|
|
@@ -64,6 +70,14 @@ class DeltaCatErrorNames(str, Enum):
|
|
64
70
|
UNCLASSIFIED_DELTACAT_ERROR = "UnclassifiedDeltaCatError"
|
65
71
|
UNRECOGNIZED_RAY_TASK_ERROR = "UnrecognizedRayTaskError"
|
66
72
|
|
73
|
+
NAMESPACE_NOT_FOUND_ERROR = "NamespaceNotFoundError"
|
74
|
+
TABLE_NOT_FOUND_ERROR = "TableNotFoundError"
|
75
|
+
TABLE_VERSION_NOT_FOUND_ERROR = "TableVersionNotFoundError"
|
76
|
+
STREAM_NOT_FOUND_ERROR = "StreamNotFoundError"
|
77
|
+
DELTA_NOT_FOUND_ERROR = "DeltaNotFoundError"
|
78
|
+
TABLE_ALREADY_EXISTS_ERROR = "TableAlreadyExistsError"
|
79
|
+
NAMESPACE_ALREADY_EXISTS_ERROR = "NamespaceAlreadyExistsError"
|
80
|
+
|
67
81
|
|
68
82
|
class DeltaCatError(Exception):
|
69
83
|
def __init__(self, *args, **kwargs):
|
@@ -206,6 +220,34 @@ class UnrecognizedRayTaskError(NonRetryableError):
|
|
206
220
|
error_name = DeltaCatErrorNames.UNRECOGNIZED_RAY_TASK_ERROR.value
|
207
221
|
|
208
222
|
|
223
|
+
class NamespaceNotFoundError(NonRetryableError):
    """Non-retryable error reported under the "NamespaceNotFoundError" name."""

    error_name = DeltaCatErrorNames.NAMESPACE_NOT_FOUND_ERROR.value
|
225
|
+
|
226
|
+
|
227
|
+
class TableNotFoundError(NonRetryableError):
    """Non-retryable error reported under the "TableNotFoundError" name."""

    error_name = DeltaCatErrorNames.TABLE_NOT_FOUND_ERROR.value
|
229
|
+
|
230
|
+
|
231
|
+
class TableVersionNotFoundError(NonRetryableError):
    """Non-retryable error reported under the "TableVersionNotFoundError" name."""

    error_name = DeltaCatErrorNames.TABLE_VERSION_NOT_FOUND_ERROR.value
|
233
|
+
|
234
|
+
|
235
|
+
class StreamNotFoundError(NonRetryableError):
    """Non-retryable error reported under the "StreamNotFoundError" name."""

    error_name = DeltaCatErrorNames.STREAM_NOT_FOUND_ERROR.value
|
237
|
+
|
238
|
+
|
239
|
+
class DeltaNotFoundError(NonRetryableError):
    """Non-retryable error reported under the "DeltaNotFoundError" name."""

    error_name = DeltaCatErrorNames.DELTA_NOT_FOUND_ERROR.value
|
241
|
+
|
242
|
+
|
243
|
+
class TableAlreadyExistsError(NonRetryableError):
    """Non-retryable error reported under the "TableAlreadyExistsError" name."""

    error_name = DeltaCatErrorNames.TABLE_ALREADY_EXISTS_ERROR.value
|
245
|
+
|
246
|
+
|
247
|
+
class NamespaceAlreadyExistsError(NonRetryableError):
    """Non-retryable error reported under the "NamespaceAlreadyExistsError" name."""

    # Use the namespace-specific enum member; reusing
    # TABLE_ALREADY_EXISTS_ERROR here would report namespace conflicts under
    # the table error name.
    error_name = DeltaCatErrorNames.NAMESPACE_ALREADY_EXISTS_ERROR.value
|
249
|
+
|
250
|
+
|
209
251
|
def categorize_errors(func: Callable):
|
210
252
|
def wrapper(*args, **kwargs):
|
211
253
|
try:
|
@@ -238,7 +280,7 @@ def categorize_errors(func: Callable):
|
|
238
280
|
|
239
281
|
def categorize_deltacat_exception(
|
240
282
|
e: BaseException,
|
241
|
-
deltacat_storage:
|
283
|
+
deltacat_storage: dc.storage.interface = None,
|
242
284
|
deltacat_storage_kwargs: dict = None,
|
243
285
|
):
|
244
286
|
if deltacat_storage_kwargs is None:
|
deltacat/logs.py
CHANGED
@@ -18,6 +18,7 @@ from deltacat.constants import (
|
|
18
18
|
DELTACAT_APP_DEBUG_LOG_BASE_FILE_NAME,
|
19
19
|
DELTACAT_SYS_DEBUG_LOG_BASE_FILE_NAME,
|
20
20
|
DELTACAT_LOGGER_CONTEXT,
|
21
|
+
DELTACAT_LOGGER_USE_SINGLE_HANDLER,
|
21
22
|
)
|
22
23
|
|
23
24
|
DEFAULT_LOG_LEVEL = "INFO"
|
@@ -226,6 +227,7 @@ def _configure_logger(
|
|
226
227
|
# This maintains log level of rotating file handlers
|
227
228
|
primary_log_level = log_level
|
228
229
|
logger.propagate = False
|
230
|
+
needs_handler = True
|
229
231
|
if log_level <= logging.getLevelName("DEBUG"):
|
230
232
|
if not _file_handler_exists(logger, log_dir, debug_log_base_file_name):
|
231
233
|
handler = _create_rotating_file_handler(
|
@@ -235,8 +237,9 @@ def _configure_logger(
|
|
235
237
|
context_kwargs=context_kwargs,
|
236
238
|
)
|
237
239
|
_add_logger_handler(logger, handler)
|
240
|
+
needs_handler = not DELTACAT_LOGGER_USE_SINGLE_HANDLER
|
238
241
|
primary_log_level = logging.getLevelName("INFO")
|
239
|
-
if not _file_handler_exists(logger, log_dir, log_base_file_name):
|
242
|
+
if not _file_handler_exists(logger, log_dir, log_base_file_name) and needs_handler:
|
240
243
|
handler = _create_rotating_file_handler(
|
241
244
|
log_dir,
|
242
245
|
log_base_file_name,
|
deltacat/storage/__init__.py
CHANGED
@@ -1,24 +1,63 @@
|
|
1
|
-
from deltacat.
|
1
|
+
from deltacat.storage.model.manifest import (
|
2
|
+
EntryType,
|
3
|
+
EntryParams,
|
2
4
|
Manifest,
|
3
5
|
ManifestAuthor,
|
4
6
|
ManifestEntry,
|
5
7
|
ManifestEntryList,
|
6
8
|
ManifestMeta,
|
7
9
|
)
|
8
|
-
from deltacat.storage.model.delta import
|
10
|
+
from deltacat.storage.model.delta import (
|
11
|
+
Delta,
|
12
|
+
DeltaLocator,
|
13
|
+
DeltaProperties,
|
14
|
+
)
|
9
15
|
from deltacat.storage.model.list_result import ListResult
|
10
16
|
from deltacat.storage.model.locator import Locator
|
11
|
-
from deltacat.storage.model.
|
12
|
-
|
13
|
-
|
14
|
-
from deltacat.storage.model.
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
17
|
+
from deltacat.storage.model.metafile import (
|
18
|
+
Metafile,
|
19
|
+
)
|
20
|
+
from deltacat.storage.model.transaction import (
|
21
|
+
TransactionOperation,
|
22
|
+
Transaction,
|
23
|
+
)
|
24
|
+
from deltacat.storage.model.namespace import (
|
25
|
+
Namespace,
|
26
|
+
NamespaceLocator,
|
27
|
+
NamespaceProperties,
|
28
|
+
)
|
29
|
+
from deltacat.storage.model.partition import (
|
30
|
+
Partition,
|
31
|
+
PartitionLocator,
|
32
|
+
PartitionLocatorAlias,
|
33
|
+
PartitionKey,
|
34
|
+
PartitionScheme,
|
35
|
+
PartitionSchemeList,
|
19
36
|
PartitionValues,
|
20
|
-
|
21
|
-
|
37
|
+
)
|
38
|
+
from deltacat.storage.model.schema import (
|
39
|
+
Field,
|
40
|
+
FieldId,
|
41
|
+
FieldLocator,
|
42
|
+
FieldName,
|
43
|
+
NestedFieldName,
|
44
|
+
Schema,
|
45
|
+
SchemaList,
|
46
|
+
)
|
47
|
+
from deltacat.storage.model.stream import (
|
48
|
+
Stream,
|
49
|
+
StreamLocator,
|
50
|
+
StreamLocatorAlias,
|
51
|
+
)
|
52
|
+
from deltacat.storage.model.table import (
|
53
|
+
Table,
|
54
|
+
TableLocator,
|
55
|
+
TableProperties,
|
56
|
+
)
|
57
|
+
from deltacat.storage.model.table_version import (
|
58
|
+
TableVersion,
|
59
|
+
TableVersionLocator,
|
60
|
+
TableVersionProperties,
|
22
61
|
)
|
23
62
|
from deltacat.storage.model.transform import (
|
24
63
|
Transform,
|
@@ -26,9 +65,17 @@ from deltacat.storage.model.transform import (
|
|
26
65
|
TransformParameters,
|
27
66
|
BucketingStrategy,
|
28
67
|
BucketTransformParameters,
|
29
|
-
|
68
|
+
TruncateTransformParameters,
|
69
|
+
BucketTransform,
|
70
|
+
IdentityTransform,
|
71
|
+
VoidTransform,
|
72
|
+
UnknownTransform,
|
73
|
+
HourTransform,
|
74
|
+
DayTransform,
|
75
|
+
MonthTransform,
|
76
|
+
YearTransform,
|
77
|
+
TruncateTransform,
|
30
78
|
)
|
31
|
-
|
32
79
|
from deltacat.storage.model.types import (
|
33
80
|
CommitState,
|
34
81
|
DeltaType,
|
@@ -36,18 +83,39 @@ from deltacat.storage.model.types import (
|
|
36
83
|
LifecycleState,
|
37
84
|
LocalDataset,
|
38
85
|
LocalTable,
|
86
|
+
NullOrder,
|
39
87
|
SchemaConsistencyType,
|
88
|
+
StreamFormat,
|
89
|
+
SortOrder,
|
90
|
+
TransactionType,
|
91
|
+
TransactionOperationType,
|
92
|
+
)
|
93
|
+
from deltacat.storage.model.sort_key import (
|
94
|
+
SortKey,
|
95
|
+
SortScheme,
|
96
|
+
SortSchemeList,
|
40
97
|
)
|
41
|
-
from deltacat.storage.
|
98
|
+
from deltacat.storage.main import impl as metastore
|
42
99
|
|
43
100
|
__all__ = [
|
101
|
+
"BucketingStrategy",
|
102
|
+
"BucketTransform",
|
103
|
+
"BucketTransformParameters",
|
44
104
|
"CommitState",
|
105
|
+
"DayTransform",
|
45
106
|
"Delta",
|
46
107
|
"DeltaLocator",
|
47
|
-
"
|
48
|
-
"DeleteParameters",
|
108
|
+
"DeltaProperties",
|
49
109
|
"DeltaType",
|
50
110
|
"DistributedDataset",
|
111
|
+
"EntryType",
|
112
|
+
"EntryParams",
|
113
|
+
"Field",
|
114
|
+
"FieldId",
|
115
|
+
"FieldLocator",
|
116
|
+
"FieldName",
|
117
|
+
"HourTransform",
|
118
|
+
"IdentityTransform",
|
51
119
|
"LifecycleState",
|
52
120
|
"ListResult",
|
53
121
|
"LocalDataset",
|
@@ -56,28 +124,50 @@ __all__ = [
|
|
56
124
|
"Manifest",
|
57
125
|
"ManifestAuthor",
|
58
126
|
"ManifestEntry",
|
59
|
-
"ManifestMeta",
|
60
127
|
"ManifestEntryList",
|
128
|
+
"ManifestMeta",
|
129
|
+
"Metafile",
|
130
|
+
"metastore",
|
131
|
+
"MonthTransform",
|
61
132
|
"Namespace",
|
62
133
|
"NamespaceLocator",
|
134
|
+
"NamespaceProperties",
|
135
|
+
"NestedFieldName",
|
136
|
+
"NullOrder",
|
137
|
+
"Partition",
|
138
|
+
"PartitionKey",
|
63
139
|
"PartitionLocator",
|
64
|
-
"
|
140
|
+
"PartitionLocatorAlias",
|
141
|
+
"PartitionScheme",
|
142
|
+
"PartitionSchemeList",
|
143
|
+
"PartitionValues",
|
144
|
+
"Schema",
|
145
|
+
"SchemaList",
|
65
146
|
"SchemaConsistencyType",
|
147
|
+
"SortKey",
|
148
|
+
"SortOrder",
|
149
|
+
"SortScheme",
|
150
|
+
"SortSchemeList",
|
151
|
+
"Stream",
|
152
|
+
"StreamFormat",
|
66
153
|
"StreamLocator",
|
154
|
+
"StreamLocatorAlias",
|
67
155
|
"Table",
|
68
156
|
"TableLocator",
|
157
|
+
"TableProperties",
|
69
158
|
"TableVersion",
|
70
159
|
"TableVersionLocator",
|
71
|
-
"
|
72
|
-
"
|
73
|
-
"
|
74
|
-
"
|
75
|
-
"
|
76
|
-
"StreamPartitionSpec",
|
160
|
+
"TableVersionProperties",
|
161
|
+
"Transaction",
|
162
|
+
"TransactionOperation",
|
163
|
+
"TransactionOperationType",
|
164
|
+
"TransactionType",
|
77
165
|
"Transform",
|
78
166
|
"TransformName",
|
79
167
|
"TransformParameters",
|
80
|
-
"
|
81
|
-
"
|
82
|
-
"
|
168
|
+
"TruncateTransform",
|
169
|
+
"TruncateTransformParameters",
|
170
|
+
"UnknownTransform",
|
171
|
+
"VoidTransform",
|
172
|
+
"YearTransform",
|
83
173
|
]
|
File without changes
|
@@ -0,0 +1,28 @@
|
|
1
|
+
from typing import Optional
|
2
|
+
|
3
|
+
from pyiceberg.catalog import Catalog
|
4
|
+
from deltacat.storage.model.scan.push_down import Pushdown
|
5
|
+
from deltacat.storage.model.scan.scan_plan import ScanPlan
|
6
|
+
from deltacat.storage.model.scan.scan_task import FileScanTask, DataFile
|
7
|
+
from deltacat.storage.util.scan_planner import ScanPlanner
|
8
|
+
from deltacat.storage.iceberg.impl import _try_load_iceberg_table
|
9
|
+
|
10
|
+
|
11
|
+
class IcebergScanPlanner(ScanPlanner):
    """Scan planner that builds a ScanPlan from an Iceberg table's planned files."""

    def __init__(self, catalog: Catalog):
        """Keep a reference to the PyIceberg catalog used to load tables."""
        self.catalog = catalog

    def create_scan_plan(
        self,
        table_name: str,
        namespace: Optional[str] = None,
        pushdown: Optional[Pushdown] = None,
    ) -> ScanPlan:
        """Plan a full scan of the given Iceberg table.

        Args:
            table_name: Name of the table to load from the catalog.
            namespace: Namespace passed through to the table loader.
            pushdown: Not read by this implementation yet (see TODO below).

        Returns:
            A ScanPlan holding one FileScanTask per planned file, each
            wrapping a single DataFile built from that file's path.
        """
        iceberg_table = _try_load_iceberg_table(
            self.catalog, namespace=namespace, table_name=table_name
        )
        # TODO: implement predicate pushdown to Iceberg
        planned_files = iceberg_table.scan().plan_files()
        return ScanPlan(
            [
                FileScanTask([DataFile(planned.file.file_path)])
                for planned in planned_files
            ]
        )
|