deltacat 1.1.36__py3-none-any.whl → 2.0.0b2__py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as they appear in their respective public registries, and is provided for informational purposes only.
- deltacat/__init__.py +42 -3
- deltacat/annotations.py +36 -0
- deltacat/api.py +168 -0
- deltacat/aws/s3u.py +4 -4
- deltacat/benchmarking/benchmark_engine.py +82 -0
- deltacat/benchmarking/benchmark_report.py +86 -0
- deltacat/benchmarking/benchmark_suite.py +11 -0
- deltacat/benchmarking/conftest.py +21 -0
- deltacat/benchmarking/data/random_row_generator.py +94 -0
- deltacat/benchmarking/data/row_generator.py +10 -0
- deltacat/benchmarking/test_benchmark_pipeline.py +106 -0
- deltacat/catalog/__init__.py +14 -0
- deltacat/catalog/delegate.py +199 -106
- deltacat/catalog/iceberg/__init__.py +4 -0
- deltacat/catalog/iceberg/iceberg_catalog_config.py +26 -0
- deltacat/catalog/iceberg/impl.py +368 -0
- deltacat/catalog/iceberg/overrides.py +74 -0
- deltacat/catalog/interface.py +273 -76
- deltacat/catalog/main/impl.py +720 -0
- deltacat/catalog/model/catalog.py +227 -20
- deltacat/catalog/model/properties.py +116 -0
- deltacat/catalog/model/table_definition.py +32 -1
- deltacat/compute/compactor/model/compaction_session_audit_info.py +7 -3
- deltacat/compute/compactor/model/delta_annotated.py +3 -3
- deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
- deltacat/compute/compactor/model/delta_file_locator.py +3 -1
- deltacat/compute/compactor/model/round_completion_info.py +5 -5
- deltacat/compute/compactor/model/table_object_store.py +3 -2
- deltacat/compute/compactor/repartition_session.py +1 -1
- deltacat/compute/compactor/steps/dedupe.py +11 -4
- deltacat/compute/compactor/steps/hash_bucket.py +1 -1
- deltacat/compute/compactor/steps/materialize.py +6 -2
- deltacat/compute/compactor/utils/io.py +1 -1
- deltacat/compute/compactor/utils/sort_key.py +9 -2
- deltacat/compute/compactor_v2/compaction_session.py +5 -9
- deltacat/compute/compactor_v2/constants.py +1 -30
- deltacat/compute/compactor_v2/deletes/utils.py +3 -3
- deltacat/compute/compactor_v2/model/merge_input.py +1 -7
- deltacat/compute/compactor_v2/private/compaction_utils.py +5 -6
- deltacat/compute/compactor_v2/steps/merge.py +17 -126
- deltacat/compute/compactor_v2/utils/content_type_params.py +0 -17
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/io.py +1 -1
- deltacat/compute/compactor_v2/utils/merge.py +0 -1
- deltacat/compute/compactor_v2/utils/primary_key_index.py +3 -15
- deltacat/compute/compactor_v2/utils/task_options.py +23 -43
- deltacat/compute/converter/constants.py +4 -0
- deltacat/compute/converter/converter_session.py +143 -0
- deltacat/compute/converter/model/convert_input.py +69 -0
- deltacat/compute/converter/model/convert_input_files.py +61 -0
- deltacat/compute/converter/model/converter_session_params.py +99 -0
- deltacat/compute/converter/pyiceberg/__init__.py +0 -0
- deltacat/compute/converter/pyiceberg/catalog.py +75 -0
- deltacat/compute/converter/pyiceberg/overrides.py +135 -0
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +251 -0
- deltacat/compute/converter/steps/__init__.py +0 -0
- deltacat/compute/converter/steps/convert.py +211 -0
- deltacat/compute/converter/steps/dedupe.py +60 -0
- deltacat/compute/converter/utils/__init__.py +0 -0
- deltacat/compute/converter/utils/convert_task_options.py +88 -0
- deltacat/compute/converter/utils/converter_session_utils.py +109 -0
- deltacat/compute/converter/utils/iceberg_columns.py +82 -0
- deltacat/compute/converter/utils/io.py +43 -0
- deltacat/compute/converter/utils/s3u.py +133 -0
- deltacat/compute/resource_estimation/delta.py +1 -19
- deltacat/constants.py +47 -1
- deltacat/env.py +51 -0
- deltacat/examples/__init__.py +0 -0
- deltacat/examples/basic_logging.py +101 -0
- deltacat/examples/common/__init__.py +0 -0
- deltacat/examples/common/fixtures.py +15 -0
- deltacat/examples/hello_world.py +27 -0
- deltacat/examples/iceberg/__init__.py +0 -0
- deltacat/examples/iceberg/iceberg_bucket_writer.py +139 -0
- deltacat/examples/iceberg/iceberg_reader.py +149 -0
- deltacat/exceptions.py +51 -9
- deltacat/logs.py +4 -1
- deltacat/storage/__init__.py +118 -28
- deltacat/storage/iceberg/__init__.py +0 -0
- deltacat/storage/iceberg/iceberg_scan_planner.py +28 -0
- deltacat/storage/iceberg/impl.py +737 -0
- deltacat/storage/iceberg/model.py +709 -0
- deltacat/storage/interface.py +217 -134
- deltacat/storage/main/__init__.py +0 -0
- deltacat/storage/main/impl.py +2077 -0
- deltacat/storage/model/delta.py +118 -71
- deltacat/storage/model/interop.py +24 -0
- deltacat/storage/model/list_result.py +8 -0
- deltacat/storage/model/locator.py +93 -3
- deltacat/{aws/redshift → storage}/model/manifest.py +122 -98
- deltacat/storage/model/metafile.py +1316 -0
- deltacat/storage/model/namespace.py +34 -18
- deltacat/storage/model/partition.py +362 -37
- deltacat/storage/model/scan/__init__.py +0 -0
- deltacat/storage/model/scan/push_down.py +19 -0
- deltacat/storage/model/scan/scan_plan.py +10 -0
- deltacat/storage/model/scan/scan_task.py +34 -0
- deltacat/storage/model/schema.py +892 -0
- deltacat/storage/model/shard.py +47 -0
- deltacat/storage/model/sort_key.py +170 -13
- deltacat/storage/model/stream.py +208 -80
- deltacat/storage/model/table.py +123 -29
- deltacat/storage/model/table_version.py +322 -46
- deltacat/storage/model/transaction.py +757 -0
- deltacat/storage/model/transform.py +198 -61
- deltacat/storage/model/types.py +111 -13
- deltacat/storage/rivulet/__init__.py +11 -0
- deltacat/storage/rivulet/arrow/__init__.py +0 -0
- deltacat/storage/rivulet/arrow/serializer.py +75 -0
- deltacat/storage/rivulet/dataset.py +744 -0
- deltacat/storage/rivulet/dataset_executor.py +87 -0
- deltacat/storage/rivulet/feather/__init__.py +5 -0
- deltacat/storage/rivulet/feather/file_reader.py +136 -0
- deltacat/storage/rivulet/feather/serializer.py +35 -0
- deltacat/storage/rivulet/fs/__init__.py +0 -0
- deltacat/storage/rivulet/fs/file_provider.py +105 -0
- deltacat/storage/rivulet/fs/file_store.py +130 -0
- deltacat/storage/rivulet/fs/input_file.py +76 -0
- deltacat/storage/rivulet/fs/output_file.py +86 -0
- deltacat/storage/rivulet/logical_plan.py +105 -0
- deltacat/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/storage/rivulet/metastore/delta.py +190 -0
- deltacat/storage/rivulet/metastore/json_sst.py +105 -0
- deltacat/storage/rivulet/metastore/sst.py +82 -0
- deltacat/storage/rivulet/metastore/sst_interval_tree.py +260 -0
- deltacat/storage/rivulet/mvp/Table.py +101 -0
- deltacat/storage/rivulet/mvp/__init__.py +5 -0
- deltacat/storage/rivulet/parquet/__init__.py +5 -0
- deltacat/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/storage/rivulet/parquet/file_reader.py +127 -0
- deltacat/storage/rivulet/parquet/serializer.py +37 -0
- deltacat/storage/rivulet/reader/__init__.py +0 -0
- deltacat/storage/rivulet/reader/block_scanner.py +378 -0
- deltacat/storage/rivulet/reader/data_reader.py +136 -0
- deltacat/storage/rivulet/reader/data_scan.py +63 -0
- deltacat/storage/rivulet/reader/dataset_metastore.py +178 -0
- deltacat/storage/rivulet/reader/dataset_reader.py +156 -0
- deltacat/storage/rivulet/reader/pyarrow_data_reader.py +121 -0
- deltacat/storage/rivulet/reader/query_expression.py +99 -0
- deltacat/storage/rivulet/reader/reader_type_registrar.py +84 -0
- deltacat/storage/rivulet/schema/__init__.py +0 -0
- deltacat/storage/rivulet/schema/datatype.py +128 -0
- deltacat/storage/rivulet/schema/schema.py +251 -0
- deltacat/storage/rivulet/serializer.py +40 -0
- deltacat/storage/rivulet/serializer_factory.py +42 -0
- deltacat/storage/rivulet/writer/__init__.py +0 -0
- deltacat/storage/rivulet/writer/dataset_writer.py +29 -0
- deltacat/storage/rivulet/writer/memtable_dataset_writer.py +294 -0
- deltacat/storage/util/__init__.py +0 -0
- deltacat/storage/util/scan_planner.py +26 -0
- deltacat/tests/_io/__init__.py +1 -0
- deltacat/tests/catalog/test_catalogs.py +324 -0
- deltacat/tests/catalog/test_default_catalog_impl.py +16 -8
- deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
- deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
- deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
- deltacat/tests/compute/compact_partition_test_cases.py +19 -53
- deltacat/tests/compute/compactor/steps/test_repartition.py +2 -2
- deltacat/tests/compute/compactor/utils/test_io.py +6 -8
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +0 -466
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +1 -273
- deltacat/tests/compute/conftest.py +75 -0
- deltacat/tests/compute/converter/__init__.py +0 -0
- deltacat/tests/compute/converter/conftest.py +80 -0
- deltacat/tests/compute/converter/test_convert_session.py +478 -0
- deltacat/tests/compute/converter/utils.py +123 -0
- deltacat/tests/compute/resource_estimation/test_delta.py +0 -16
- deltacat/tests/compute/test_compact_partition_incremental.py +2 -42
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +5 -46
- deltacat/tests/compute/test_compact_partition_params.py +3 -3
- deltacat/tests/compute/test_compact_partition_rebase.py +1 -46
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +5 -46
- deltacat/tests/compute/test_util_common.py +19 -12
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +13 -22
- deltacat/tests/local_deltacat_storage/__init__.py +76 -103
- deltacat/tests/storage/__init__.py +0 -0
- deltacat/tests/storage/conftest.py +25 -0
- deltacat/tests/storage/main/__init__.py +0 -0
- deltacat/tests/storage/main/test_main_storage.py +1399 -0
- deltacat/tests/storage/model/__init__.py +0 -0
- deltacat/tests/storage/model/test_delete_parameters.py +21 -0
- deltacat/tests/storage/model/test_metafile_io.py +2535 -0
- deltacat/tests/storage/model/test_schema.py +308 -0
- deltacat/tests/storage/model/test_shard.py +22 -0
- deltacat/tests/storage/model/test_table_version.py +110 -0
- deltacat/tests/storage/model/test_transaction.py +308 -0
- deltacat/tests/storage/rivulet/__init__.py +0 -0
- deltacat/tests/storage/rivulet/conftest.py +149 -0
- deltacat/tests/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +93 -0
- deltacat/tests/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/storage/rivulet/schema/test_schema.py +241 -0
- deltacat/tests/storage/rivulet/test_dataset.py +406 -0
- deltacat/tests/storage/rivulet/test_manifest.py +67 -0
- deltacat/tests/storage/rivulet/test_sst_interval_tree.py +232 -0
- deltacat/tests/storage/rivulet/test_utils.py +122 -0
- deltacat/tests/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/storage/rivulet/writer/test_dataset_write_then_read.py +341 -0
- deltacat/tests/storage/rivulet/writer/test_dataset_writer.py +79 -0
- deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
- deltacat/tests/test_deltacat_api.py +39 -0
- deltacat/tests/test_utils/filesystem.py +14 -0
- deltacat/tests/test_utils/message_pack_utils.py +54 -0
- deltacat/tests/test_utils/pyarrow.py +8 -15
- deltacat/tests/test_utils/storage.py +266 -3
- deltacat/tests/utils/test_daft.py +3 -3
- deltacat/tests/utils/test_pyarrow.py +0 -432
- deltacat/types/partial_download.py +1 -1
- deltacat/types/tables.py +1 -1
- deltacat/utils/export.py +59 -0
- deltacat/utils/filesystem.py +320 -0
- deltacat/utils/metafile_locator.py +73 -0
- deltacat/utils/pyarrow.py +36 -183
- deltacat-2.0.0b2.dist-info/METADATA +65 -0
- deltacat-2.0.0b2.dist-info/RECORD +349 -0
- deltacat/aws/redshift/__init__.py +0 -19
- deltacat/catalog/default_catalog_impl/__init__.py +0 -369
- deltacat/io/dataset.py +0 -73
- deltacat/io/read_api.py +0 -143
- deltacat/storage/model/delete_parameters.py +0 -40
- deltacat/storage/model/partition_spec.py +0 -71
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +0 -253
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +0 -45
- deltacat-1.1.36.dist-info/METADATA +0 -64
- deltacat-1.1.36.dist-info/RECORD +0 -219
- /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
- /deltacat/{io/aws → catalog/main}/__init__.py +0 -0
- /deltacat/{io/aws/redshift → compute/converter}/__init__.py +0 -0
- /deltacat/{tests/io → compute/converter/model}/__init__.py +0 -0
- /deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +0 -0
- /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
- {deltacat-1.1.36.dist-info → deltacat-2.0.0b2.dist-info}/LICENSE +0 -0
- {deltacat-1.1.36.dist-info → deltacat-2.0.0b2.dist-info}/WHEEL +0 -0
- {deltacat-1.1.36.dist-info → deltacat-2.0.0b2.dist-info}/top_level.txt +0 -0
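The example files added under deltacat/examples/ (shown below) all revolve around the new 2.0 catalog API: register one or more catalogs with `dc.init`, then address tables by name and namespace. As a condensed, non-authoritative sketch of that flow, with the warehouse path and catalog name as illustrative placeholders taken from the examples (Ray is initialized automatically by `dc.init`, per the example comments):

import deltacat as dc
from deltacat import IcebergCatalog

# sketch only: register a single Iceberg-backed catalog named "iceberg"
dc.init(
    catalogs={
        "iceberg": dc.Catalog(
            impl=IcebergCatalog,
            # remaining kwargs are forwarded to pyiceberg.catalog.load_catalog
            name="example-iceberg-catalog",
            properties={
                "type": "glue",
                "region_name": "us-east-1",
                "warehouse": "s3://my-bucket/my/key/prefix",
            },
        )
    },
)

# with a single registered catalog, table calls route to it by default
table_definition = dc.get_table("test_table", "test_namespace")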
deltacat/examples/basic_logging.py
ADDED
@@ -0,0 +1,101 @@
+import os
+import ray
+import logging
+
+from deltacat import logs
+from deltacat.constants import DELTACAT_APP_LOG_DIR, DELTACAT_SYS_LOG_DIR
+from deltacat.examples.common.fixtures import (
+    store_cli_args_in_os_environ,
+)
+from deltacat.env import create_ray_runtime_environment
+
+# initialize the driver logger
+driver_logger = logs.configure_application_logger(logging.getLogger(__name__))
+
+
+@ray.remote
+def logging_worker(var1, var2):
+    # for AWS Glue, worker loggers must be initialized within the worker process
+    worker_logger = logs.configure_application_logger(logging.getLogger(__name__))
+
+    log_line_1 = f"Worker System Environment: {os.environ}"
+    print(
+        f"Writing DEBUG log line from Worker to {DELTACAT_APP_LOG_DIR}: '{log_line_1}'"
+    )
+    worker_logger.debug(log_line_1)
+
+    log_line_2 = f"Worker Variable 1: {var1}"
+    print(
+        f"Writing INFO log line from Worker to {DELTACAT_APP_LOG_DIR}: '{log_line_2}'"
+    )
+    worker_logger.info(log_line_2)
+
+    log_line_3 = f"Worker Variable 2: {var2}"
+    print(
+        f"Writing INFO log line from Worker to {DELTACAT_APP_LOG_DIR}: '{log_line_3}'"
+    )
+    worker_logger.info(log_line_3)
+
+
+def run(var1="default1", var2="default2", **kwargs):
+    log_line_1 = f"Driver Variable 1: {var1}"
+    print(
+        f"Writing INFO log line from Driver to {DELTACAT_APP_LOG_DIR}: '{log_line_1}'"
+    )
+    driver_logger.info(log_line_1)
+
+    log_line_2 = f"Driver Variable 2: {var2}"
+    print(
+        f"Writing INFO log line from Driver to {DELTACAT_APP_LOG_DIR}: '{log_line_2}'"
+    )
+    driver_logger.info(log_line_2)
+
+    print("Starting worker...")
+    ray.get(logging_worker.remote(var1, var2))
+    print(
+        f"The driver is shutting down. Additional DeltaCAT system logs have been written to {DELTACAT_SYS_LOG_DIR}"
+    )
+
+
+if __name__ == "__main__":
+    example_script_args = [
+        (
+            [
+                "--var1",
+            ],
+            {
+                "help": "First argument to log.",
+                "type": str,
+            },
+        ),
+        (
+            [
+                "--var2",
+            ],
+            {
+                "help": "Second argument to log.",
+                "type": str,
+            },
+        ),
+        (
+            [
+                "--STAGE",
+            ],
+            {
+                "help": "Example runtime environment stage (e.g. dev, alpha, beta, prod).",
+                "type": str,
+            },
+        ),
+    ]
+
+    # store any CLI args in the runtime environment
+    store_cli_args_in_os_environ(example_script_args)
+
+    # create any runtime environment required to run the example
+    runtime_env = create_ray_runtime_environment()
+
+    # initialize ray
+    ray.init(runtime_env=runtime_env)
+
+    # run the example using os.environ as kwargs
+    run(**os.environ)
deltacat/examples/common/__init__.py
File without changes
deltacat/examples/common/fixtures.py
ADDED
@@ -0,0 +1,15 @@
+import os
+import logging
+import argparse
+from deltacat import logs
+
+logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+
+def store_cli_args_in_os_environ(script_args_list=[]):
+    parser = argparse.ArgumentParser()
+    for args, kwargs in script_args_list:
+        parser.add_argument(*args, **kwargs)
+    args = parser.parse_args()
+    print(f"Command Line Arguments: {args}")
+    os.environ.update(vars(args))
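Each entry in `script_args_list` above is an `(args, kwargs)` pair forwarded verbatim to `argparse.ArgumentParser.add_argument`, with the parsed values copied into `os.environ`. A minimal usage sketch, assuming a hypothetical `--warehouse` flag like the Iceberg examples below use:

from deltacat.examples.common.fixtures import store_cli_args_in_os_environ

# hypothetical invocation: python my_example.py --warehouse s3://my-bucket/prefix
store_cli_args_in_os_environ(
    [
        (["--warehouse"], {"help": "S3 path for Iceberg file storage.", "type": str}),
    ]
)
# argparse stores the value under the "warehouse" dest, so it is now readable
# as os.environ["warehouse"] by downstream example code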
deltacat/examples/hello_world.py
ADDED
@@ -0,0 +1,27 @@
+import ray
+import deltacat
+import daft
+import pyiceberg
+
+
+def print_package_version_info():
+    print(f"DeltaCAT Version: {deltacat.__version__}")
+    print(f"PyIceberg Version: {pyiceberg.__version__}")
+    print(f"Ray Version: {ray.__version__}")
+    print(f"Daft Version: {daft.__version__}")
+
+
+@ray.remote
+def hello_worker():
+    print("Hello, Worker!")
+    print_package_version_info()
+
+
+def run():
+    print("Hello, Driver!")
+    print_package_version_info()
+    hello_worker.remote()
+
+
+if __name__ == "__main__":
+    run()
deltacat/examples/iceberg/__init__.py
File without changes
deltacat/examples/iceberg/iceberg_bucket_writer.py
ADDED
@@ -0,0 +1,139 @@
+import os
+import logging
+
+import daft
+import deltacat as dc
+
+from deltacat import logs
+from deltacat import IcebergCatalog
+from deltacat.examples.common.fixtures import (
+    store_cli_args_in_os_environ,
+)
+
+from pyiceberg.schema import (
+    Schema,
+    NestedField,
+    DoubleType,
+    StringType,
+)
+from pyiceberg.partitioning import PartitionSpec, PartitionField
+from pyiceberg.transforms import BucketTransform
+
+from deltacat.storage.iceberg.model import (
+    SchemaMapper,
+    PartitionSchemeMapper,
+)
+from deltacat.env import create_ray_runtime_environment
+
+# initialize the driver logger
+driver_logger = logs.configure_application_logger(logging.getLogger(__name__))
+
+
+def run(warehouse="s3://my-bucket/my/key/prefix", **kwargs):
+    # create any runtime environment required to run the example
+    runtime_env = create_ray_runtime_environment()
+
+    # Start by initializing DeltaCAT and registering available Catalogs.
+    # Ray will be initialized automatically via `ray.init()`.
+    # Only the `iceberg` data catalog is provided so it will become the default.
+    # If initializing multiple catalogs, use the `default_catalog_name` param
+    # to specify which catalog should be the default.
+    dc.init(
+        catalogs={
+            # the name of the DeltaCAT catalog is "iceberg"
+            "iceberg": dc.Catalog(
+                # Apache Iceberg implementation of deltacat.catalog.interface
+                impl=IcebergCatalog,
+                # kwargs for pyiceberg.catalog.load_catalog start here...
+                # the name of the Iceberg catalog is "example-iceberg-catalog"
+                name="example-iceberg-catalog",
+                # for additional properties see:
+                # https://py.iceberg.apache.org/configuration/
+                properties={
+                    "type": "glue",
+                    "region_name": "us-east-1",
+                    "warehouse": warehouse,
+                },
+            )
+        },
+        # pass the runtime environment into ray.init()
+        ray_init_args={"runtime_env": runtime_env},
+    )
+
+    # define a native Iceberg table schema
+    schema = Schema(
+        NestedField(field_id=1, name="symbol", field_type=StringType(), required=True),
+        NestedField(field_id=2, name="bid", field_type=DoubleType(), required=False),
+        NestedField(field_id=3, name="ask", field_type=DoubleType(), required=False),
+    )
+
+    # define a native Iceberg partition spec
+    partition_spec = PartitionSpec(
+        PartitionField(
+            source_id=1,
+            field_id=1000,
+            transform=BucketTransform(2),
+            name="symbol_bucket",
+        )
+    )
+
+    # define a native Iceberg sort order
+    # sort_order = SortOrder(SortField(source_id=1, transform=IdentityTransform()))
+
+    # define the Daft dataframe to write
+    df = daft.from_pydict(
+        {
+            "symbol": ["amzn", "goog", "meta", "msft"],
+            "bid": [157.16, 150.55, 392.03, 403.25],
+            "ask": [157.17, 150.56, 392.09, 403.27],
+        }
+    )
+
+    # write to a table named `test_namespace.test_table_bucketed`
+    # we don't need to specify which catalog to create this table in since
+    # only the "iceberg" catalog is available
+    table_name = "test_table_bucketed"
+    namespace = "test_namespace"
+    print(f"Creating Glue Table: {namespace}.{table_name}")
+    dc.write_to_table(
+        data=df,
+        # path=warehouse + "/datafiles",
+        table=table_name,
+        namespace=namespace,
+        schema=SchemaMapper.map(schema),
+        partition_scheme=PartitionSchemeMapper.map(partition_spec, schema),
+        # sort_keys=SortSchemeMapper.map(sort_order, schema),
+    )
+
+    print(f"Getting Glue Table: {namespace}.{table_name}")
+    table_definition = dc.get_table(table_name, namespace)
+    print(f"Retrieved Glue Table: {table_definition}")
+
+
+if __name__ == "__main__":
+    example_script_args = [
+        (
+            [
+                "--warehouse",
+            ],
+            {
+                "help": "S3 path for Iceberg file storage.",
+                "type": str,
+            },
+        ),
+        (
+            [
+                "--STAGE",
+            ],
+            {
+                "help": "Example runtime environment stage (e.g. dev, alpha, beta, prod).",
+                "type": str,
+            },
+        ),
+    ]
+
+    # store any CLI args in the runtime environment
+    store_cli_args_in_os_environ(example_script_args)
+
+    # run the example using os.environ as kwargs
+    run(**os.environ)
deltacat/examples/iceberg/iceberg_reader.py
ADDED
@@ -0,0 +1,149 @@
+import os
+import logging
+import deltacat as dc
+
+from deltacat import logs
+from deltacat import IcebergCatalog
+from deltacat.examples.common.fixtures import (
+    store_cli_args_in_os_environ,
+)
+
+from pyiceberg.schema import (
+    Schema,
+    NestedField,
+    DoubleType,
+    StringType,
+    TimestampType,
+    FloatType,
+    StructType,
+)
+from pyiceberg.partitioning import PartitionSpec, PartitionField
+from pyiceberg.transforms import DayTransform, IdentityTransform
+from pyiceberg.table.sorting import SortField, SortOrder
+
+from deltacat.exceptions import TableAlreadyExistsError
+from deltacat.storage.iceberg.model import (
+    SchemaMapper,
+    PartitionSchemeMapper,
+    SortSchemeMapper,
+)
+from deltacat.env import create_ray_runtime_environment
+
+# initialize the driver logger
+driver_logger = logs.configure_application_logger(logging.getLogger(__name__))
+
+
+def run(warehouse="s3://my-bucket/my/key/prefix", **kwargs):
+    # create any runtime environment required to run the example
+    runtime_env = create_ray_runtime_environment()
+
+    # Start by initializing DeltaCAT and registering available Catalogs.
+    # Ray will be initialized automatically via `ray.init()`.
+    # Only the `iceberg` data catalog is provided so it will become the default.
+    # If initializing multiple catalogs, use the `default_catalog_name` param
+    # to specify which catalog should be the default.
+    dc.init(
+        catalogs={
+            # the name of the DeltaCAT catalog is "iceberg"
+            "iceberg": dc.Catalog(
+                # Apache Iceberg implementation of deltacat.catalog.interface
+                impl=IcebergCatalog,
+                # kwargs for pyiceberg.catalog.load_catalog start here...
+                # the name of the Iceberg catalog is "example-iceberg-catalog"
+                name="example-iceberg-catalog",
+                # for additional properties see:
+                # https://py.iceberg.apache.org/configuration/
+                properties={
+                    "type": "glue",
+                    "region_name": "us-east-1",
+                    "warehouse": warehouse,
+                },
+            )
+        },
+        # pass the runtime environment into ray.init()
+        ray_init_args={"runtime_env": runtime_env},
+    )
+
+    # define a native Iceberg table schema
+    schema = Schema(
+        NestedField(
+            field_id=1, name="datetime", field_type=TimestampType(), required=True
+        ),
+        NestedField(field_id=2, name="symbol", field_type=StringType(), required=True),
+        NestedField(field_id=3, name="bid", field_type=FloatType(), required=False),
+        NestedField(field_id=4, name="ask", field_type=DoubleType(), required=False),
+        NestedField(
+            field_id=5,
+            name="details",
+            field_type=StructType(
+                NestedField(
+                    field_id=6,
+                    name="created_by",
+                    field_type=StringType(),
+                    required=False,
+                ),
+            ),
+            required=False,
+        ),
+    )
+
+    # define a native Iceberg partition spec
+    partition_spec = PartitionSpec(
+        PartitionField(
+            source_id=1, field_id=1000, transform=DayTransform(), name="datetime_day"
+        )
+    )
+
+    # define a native Iceberg sort order
+    sort_order = SortOrder(SortField(source_id=2, transform=IdentityTransform()))
+
+    # create a table named `test_namespace.test_table`
+    # we don't need to specify which catalog to create this table in since
+    # only the "iceberg" catalog is available
+    table_name = "test_table"
+    namespace = "test_namespace"
+    print(f"Creating Glue Table: {namespace}.{table_name}")
+    try:
+        table_definition = dc.create_table(
+            table=table_name,
+            namespace=namespace,
+            schema=SchemaMapper.map(schema),
+            partition_scheme=PartitionSchemeMapper.map(partition_spec, schema),
+            sort_keys=SortSchemeMapper.map(sort_order, schema),
+        )
+        print(f"Created Glue Table: {table_definition}")
+    except TableAlreadyExistsError:
+        print(f"Glue Table `{namespace}.{table_name}` already exists.")
+
+    print(f"Getting Glue Table: {namespace}.{table_name}")
+    table_definition = dc.get_table(table_name, namespace)
+    print(f"Retrieved Glue Table: {table_definition}")
+
+
+if __name__ == "__main__":
+    example_script_args = [
+        (
+            [
+                "--warehouse",
+            ],
+            {
+                "help": "S3 path for Iceberg file storage.",
+                "type": str,
+            },
+        ),
+        (
+            [
+                "--STAGE",
+            ],
+            {
+                "help": "Example runtime environment stage (e.g. dev, alpha, beta, prod).",
+                "type": str,
+            },
+        ),
+    ]
+
+    # store any CLI args in the runtime environment
+    store_cli_args_in_os_environ(example_script_args)
+
+    # run the example using os.environ as kwargs
+    run(**os.environ)
deltacat/exceptions.py
CHANGED
@@ -1,10 +1,16 @@
 from __future__ import annotations
 from enum import Enum
-import
-import ray
+from typing import Callable
 import logging
+
 import tenacity
-
+
+from pyarrow.lib import ArrowException, ArrowInvalid, ArrowCapacityError
+
+import botocore
+from botocore.exceptions import BotoCoreError
+
+import ray
 from ray.exceptions import (
     RayError,
     RayTaskError,
@@ -13,14 +19,14 @@ from ray.exceptions import (
     NodeDiedError,
     OutOfMemoryError,
 )
-
-from
-
-
+
+from daft.exceptions import DaftTransientError, DaftCoreException
+
+import deltacat as dc
+from deltacat import logs
 from deltacat.utils.ray_utils.runtime import (
     get_current_ray_task_id,
 )
-from daft.exceptions import DaftTransientError, DaftCoreException
 
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
@@ -64,6 +70,14 @@ class DeltaCatErrorNames(str, Enum):
     UNCLASSIFIED_DELTACAT_ERROR = "UnclassifiedDeltaCatError"
     UNRECOGNIZED_RAY_TASK_ERROR = "UnrecognizedRayTaskError"
 
+    NAMESPACE_NOT_FOUND_ERROR = "NamespaceNotFoundError"
+    TABLE_NOT_FOUND_ERROR = "TableNotFoundError"
+    TABLE_VERSION_NOT_FOUND_ERROR = "TableVersionNotFoundError"
+    STREAM_NOT_FOUND_ERROR = "StreamNotFoundError"
+    DELTA_NOT_FOUND_ERROR = "DeltaNotFoundError"
+    TABLE_ALREADY_EXISTS_ERROR = "TableAlreadyExistsError"
+    NAMESPACE_ALREADY_EXISTS_ERROR = "NamespaceAlreadyExistsError"
+
 
 class DeltaCatError(Exception):
     def __init__(self, *args, **kwargs):
@@ -206,6 +220,34 @@ class UnrecognizedRayTaskError(NonRetryableError):
     error_name = DeltaCatErrorNames.UNRECOGNIZED_RAY_TASK_ERROR.value
 
 
+class NamespaceNotFoundError(NonRetryableError):
+    error_name = DeltaCatErrorNames.NAMESPACE_NOT_FOUND_ERROR.value
+
+
+class TableNotFoundError(NonRetryableError):
+    error_name = DeltaCatErrorNames.TABLE_NOT_FOUND_ERROR.value
+
+
+class TableVersionNotFoundError(NonRetryableError):
+    error_name = DeltaCatErrorNames.TABLE_VERSION_NOT_FOUND_ERROR.value
+
+
+class StreamNotFoundError(NonRetryableError):
+    error_name = DeltaCatErrorNames.STREAM_NOT_FOUND_ERROR.value
+
+
+class DeltaNotFoundError(NonRetryableError):
+    error_name = DeltaCatErrorNames.DELTA_NOT_FOUND_ERROR.value
+
+
+class TableAlreadyExistsError(NonRetryableError):
+    error_name = DeltaCatErrorNames.TABLE_ALREADY_EXISTS_ERROR.value
+
+
+class NamespaceAlreadyExistsError(NonRetryableError):
+    error_name = DeltaCatErrorNames.TABLE_ALREADY_EXISTS_ERROR.value
+
+
 def categorize_errors(func: Callable):
     def wrapper(*args, **kwargs):
         try:
@@ -238,7 +280,7 @@ def categorize_errors(func: Callable):
 
 def categorize_deltacat_exception(
     e: BaseException,
-    deltacat_storage:
+    deltacat_storage: dc.storage.interface = None,
     deltacat_storage_kwargs: dict = None,
 ):
     if deltacat_storage_kwargs is None:
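The new `*NotFoundError` and `*AlreadyExistsError` classes are all `NonRetryableError` subclasses, so callers can catch them by type around catalog calls. A minimal sketch mirroring the try/except pattern used by the `iceberg_reader` example above; the table and namespace names are illustrative, and the schema/partition/sort arguments a real `create_table` call would pass are omitted for brevity:

import deltacat as dc
from deltacat.exceptions import TableAlreadyExistsError

# sketch only: a real call would pass schema/partition_scheme/sort_keys as in
# the iceberg_reader example above
try:
    table_definition = dc.create_table(table="test_table", namespace="test_namespace")
    print(f"Created table: {table_definition}")
except TableAlreadyExistsError:
    print("Table `test_namespace.test_table` already exists.")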
deltacat/logs.py
CHANGED
@@ -18,6 +18,7 @@ from deltacat.constants import (
     DELTACAT_APP_DEBUG_LOG_BASE_FILE_NAME,
     DELTACAT_SYS_DEBUG_LOG_BASE_FILE_NAME,
     DELTACAT_LOGGER_CONTEXT,
+    DELTACAT_LOGGER_USE_SINGLE_HANDLER,
 )
 
 DEFAULT_LOG_LEVEL = "INFO"
@@ -226,6 +227,7 @@ def _configure_logger(
     # This maintains log level of rotating file handlers
     primary_log_level = log_level
     logger.propagate = False
+    needs_handler = True
     if log_level <= logging.getLevelName("DEBUG"):
         if not _file_handler_exists(logger, log_dir, debug_log_base_file_name):
             handler = _create_rotating_file_handler(
@@ -235,8 +237,9 @@ def _configure_logger(
                 context_kwargs=context_kwargs,
             )
             _add_logger_handler(logger, handler)
+            needs_handler = not DELTACAT_LOGGER_USE_SINGLE_HANDLER
         primary_log_level = logging.getLevelName("INFO")
-    if not _file_handler_exists(logger, log_dir, log_base_file_name):
+    if not _file_handler_exists(logger, log_dir, log_base_file_name) and needs_handler:
         handler = _create_rotating_file_handler(
             log_dir,
             log_base_file_name,
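The net effect of the `logs.py` change is that, once a DEBUG-level rotating file handler has been attached and `DELTACAT_LOGGER_USE_SINGLE_HANDLER` is truthy, the second INFO-level rotating file handler is skipped, so the logger writes through a single handler. Application code is configured the same way as before; a minimal usage sketch in the style of the example files above (the log line is illustrative, and how the constant is populated in `deltacat.constants` is not shown in this diff):

import logging
from deltacat import logs

# configure a DeltaCAT logger as the examples in this diff do; whether one or
# two rotating file handlers get attached is governed internally by
# DELTACAT_LOGGER_USE_SINGLE_HANDLER in deltacat.constants
logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
logger.info("single-handler logging sketch")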