deltacat 2.0.0b7__py3-none-any.whl → 2.0.0b10__py3-none-any.whl
This diff shows the changes between publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
- deltacat/__init__.py +27 -6
- deltacat/api.py +478 -123
- deltacat/aws/s3u.py +2 -2
- deltacat/benchmarking/conftest.py +1 -1
- deltacat/catalog/main/impl.py +12 -6
- deltacat/catalog/model/catalog.py +65 -47
- deltacat/catalog/model/properties.py +1 -3
- deltacat/compute/__init__.py +14 -0
- deltacat/compute/converter/constants.py +5 -0
- deltacat/compute/converter/converter_session.py +78 -36
- deltacat/compute/converter/model/convert_input.py +24 -4
- deltacat/compute/converter/model/convert_result.py +61 -0
- deltacat/compute/converter/model/converter_session_params.py +52 -10
- deltacat/compute/converter/pyiceberg/overrides.py +181 -62
- deltacat/compute/converter/steps/convert.py +84 -36
- deltacat/compute/converter/steps/dedupe.py +25 -4
- deltacat/compute/converter/utils/convert_task_options.py +42 -13
- deltacat/compute/converter/utils/iceberg_columns.py +5 -0
- deltacat/compute/converter/utils/io.py +82 -11
- deltacat/compute/converter/utils/s3u.py +13 -4
- deltacat/compute/jobs/__init__.py +0 -0
- deltacat/compute/jobs/client.py +404 -0
- deltacat/constants.py +4 -4
- deltacat/daft/daft_scan.py +7 -3
- deltacat/daft/translator.py +126 -0
- deltacat/examples/basic_logging.py +5 -3
- deltacat/examples/hello_world.py +4 -2
- deltacat/examples/indexer/__init__.py +0 -0
- deltacat/examples/indexer/aws/__init__.py +0 -0
- deltacat/examples/indexer/gcp/__init__.py +0 -0
- deltacat/examples/indexer/indexer.py +163 -0
- deltacat/examples/indexer/job_runner.py +199 -0
- deltacat/io/__init__.py +13 -0
- deltacat/io/dataset/__init__.py +0 -0
- deltacat/io/dataset/deltacat_dataset.py +91 -0
- deltacat/io/datasink/__init__.py +0 -0
- deltacat/io/datasink/deltacat_datasink.py +207 -0
- deltacat/io/datasource/__init__.py +0 -0
- deltacat/io/datasource/deltacat_datasource.py +580 -0
- deltacat/io/reader/__init__.py +0 -0
- deltacat/io/reader/deltacat_read_api.py +172 -0
- deltacat/storage/__init__.py +2 -0
- deltacat/storage/model/expression/__init__.py +47 -0
- deltacat/storage/model/expression/expression.py +656 -0
- deltacat/storage/model/expression/visitor.py +248 -0
- deltacat/storage/model/metafile.py +74 -42
- deltacat/storage/model/scan/push_down.py +32 -5
- deltacat/storage/model/types.py +5 -3
- deltacat/storage/rivulet/__init__.py +4 -4
- deltacat/tests/_io/reader/__init__.py +0 -0
- deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
- deltacat/tests/compute/converter/test_convert_session.py +209 -46
- deltacat/tests/local_deltacat_storage/__init__.py +1 -0
- deltacat/tests/storage/model/test_expression.py +327 -0
- deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +2 -1
- deltacat/tests/storage/rivulet/test_dataset.py +1 -1
- deltacat/tests/storage/rivulet/test_manifest.py +1 -1
- deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +1 -1
- deltacat/tests/test_deltacat_api.py +50 -9
- deltacat/types/media.py +141 -43
- deltacat/types/tables.py +35 -7
- deltacat/utils/daft.py +2 -2
- deltacat/utils/filesystem.py +39 -9
- deltacat/utils/polars.py +128 -0
- deltacat/utils/pyarrow.py +151 -15
- deltacat/utils/ray_utils/concurrency.py +1 -1
- deltacat/utils/ray_utils/runtime.py +56 -4
- deltacat/utils/url.py +1284 -0
- {deltacat-2.0.0b7.dist-info → deltacat-2.0.0b10.dist-info}/METADATA +9 -6
- {deltacat-2.0.0b7.dist-info → deltacat-2.0.0b10.dist-info}/RECORD +73 -48
- {deltacat-2.0.0b7.dist-info → deltacat-2.0.0b10.dist-info}/LICENSE +0 -0
- {deltacat-2.0.0b7.dist-info → deltacat-2.0.0b10.dist-info}/WHEEL +0 -0
- {deltacat-2.0.0b7.dist-info → deltacat-2.0.0b10.dist-info}/top_level.txt +0 -0
deltacat/compute/converter/utils/convert_task_options.py CHANGED

@@ -4,7 +4,10 @@ from deltacat.exceptions import RetryableError
 AVERAGE_FILE_PATH_COLUMN_SIZE_BYTES = 80
 AVERAGE_POS_COLUMN_SIZE_BYTES = 4
 XXHASH_BYTE_PER_RECORD = 8
-MEMORY_BUFFER_RATE =
+MEMORY_BUFFER_RATE = 2
+# TODO: Add audit info to check this number in practice
+# Worst case 2 as no duplicates exists across all pk
+PYARROW_AGGREGATE_MEMORY_MULTIPLIER = 2
 
 
 def estimate_fixed_hash_columns(hash_value_size_bytes_per_record, total_record_count):
@@ -13,8 +16,8 @@ def estimate_fixed_hash_columns(hash_value_size_bytes_per_record, total_record_c
 
 def get_total_record_from_iceberg_files(iceberg_files_list):
     total_record_count = 0
-
-
+    # file are in form of tuple (sequence_number, DataFile)
+    total_record_count += sum(file[1].record_count for file in iceberg_files_list)
     return total_record_count
 
 
@@ -68,7 +71,8 @@ def _get_task_options(
         task_opts["resources"] = ray_custom_resources
 
     task_opts["max_retries"] = 3
-
+    task_opts["num_cpus"] = 1
+    task_opts["resources"] = {"convert_task": 1}
     # List of possible botocore exceptions are available at
     # https://github.com/boto/botocore/blob/develop/botocore/exceptions.py
     task_opts["retry_exceptions"] = [RetryableError]
@@ -76,13 +80,38 @@ def _get_task_options(
     return task_opts
 
 
-def
-(
-
-
-
-
-
-    data_files_list, equality_delete_files_list
+def estimate_dedupe_memory(all_data_files_for_dedupe):
+    dedupe_record_count = get_total_record_from_iceberg_files(all_data_files_for_dedupe)
+    produced_pos_memory_required = estimate_iceberg_pos_delete_additional_columns(
+        ["file_path", "pos"], dedupe_record_count
+    )
+    download_pk_memory_required = estimate_fixed_hash_columns(
+        XXHASH_BYTE_PER_RECORD, dedupe_record_count
     )
-
+    memory_required_by_dedupe = (
+        produced_pos_memory_required + download_pk_memory_required
+    ) * PYARROW_AGGREGATE_MEMORY_MULTIPLIER
+    memory_with_buffer = memory_required_by_dedupe * MEMORY_BUFFER_RATE
+    return memory_with_buffer
+
+
+def convert_resource_options_provider(index, convert_input_files):
+    applicable_data_files = convert_input_files.applicable_data_files
+    applicable_equality_delete_files = (
+        convert_input_files.applicable_equality_delete_files
+    )
+    all_data_files_for_dedupe = convert_input_files.all_data_files_for_dedupe
+    total_memory_required = 0
+    if applicable_data_files and applicable_equality_delete_files:
+        memory_requirement_for_convert_equality_deletes = (
+            estimate_convert_remote_option_resources(
+                applicable_data_files, applicable_equality_delete_files
+            )
+        )
+        total_memory_required += memory_requirement_for_convert_equality_deletes
+    if all_data_files_for_dedupe:
+        memory_requirement_for_dedupe = estimate_dedupe_memory(
+            all_data_files_for_dedupe
+        )
+        total_memory_required += memory_requirement_for_dedupe
+    return _get_task_options(memory=total_memory_required)
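Note: the hunk above also decides how much memory each Ray convert/dedupe task requests. As a rough illustration of that arithmetic only, here is a sketch; the `approx_dedupe_memory_bytes` helper is hypothetical, not part of the package, and assumes the per-column estimators simply multiply record count by the byte-size constants shown.

```python
# Hypothetical sketch of the dedupe memory estimate from the diff above.
AVERAGE_FILE_PATH_COLUMN_SIZE_BYTES = 80   # bytes per "file_path" value
AVERAGE_POS_COLUMN_SIZE_BYTES = 4          # bytes per "pos" value
XXHASH_BYTE_PER_RECORD = 8                 # bytes per primary-key hash
PYARROW_AGGREGATE_MEMORY_MULTIPLIER = 2    # worst case: no duplicates across all pk
MEMORY_BUFFER_RATE = 2                     # extra safety buffer


def approx_dedupe_memory_bytes(record_count: int) -> int:
    # positional-delete columns ("file_path", "pos") produced by dedupe
    produced_pos_bytes = record_count * (
        AVERAGE_FILE_PATH_COLUMN_SIZE_BYTES + AVERAGE_POS_COLUMN_SIZE_BYTES
    )
    # downloaded primary-key hash column
    download_pk_bytes = record_count * XXHASH_BYTE_PER_RECORD
    return (
        (produced_pos_bytes + download_pk_bytes)
        * PYARROW_AGGREGATE_MEMORY_MULTIPLIER
        * MEMORY_BUFFER_RATE
    )


# e.g. 10 million records -> roughly 3.7 GB requested for the dedupe step
print(approx_dedupe_memory_bytes(10_000_000))
```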
deltacat/compute/converter/utils/io.py CHANGED

@@ -1,26 +1,43 @@
+import logging
+from deltacat import logs
 import deltacat.compute.converter.utils.iceberg_columns as sc
 import daft
+from deltacat.utils.daft import _get_s3_io_config
+from daft import TimeUnit
+import pyarrow as pa
+from deltacat.utils.pyarrow import sliced_string_cast
+from deltacat.compute.converter.constants import IDENTIFIER_FIELD_DELIMITER
+
+import pyarrow.compute as pc
+
+logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
 
 def download_data_table_and_append_iceberg_columns(
-    file,
+    file,
+    columns_to_download,
+    additional_columns_to_append,
+    sequence_number,
+    s3_client_kwargs,
 ):
-    # TODO; add S3 client kwargs
     table = download_parquet_with_daft_hash_applied(
-
+        identifier_columns=columns_to_download,
+        file=file,
+        s3_client_kwargs=s3_client_kwargs,
     )
+
     if sc._FILE_PATH_COLUMN_NAME in additional_columns_to_append:
         table = sc.append_file_path_column(table, file.file_path)
     if sc._ORDERED_RECORD_IDX_COLUMN_NAME in additional_columns_to_append:
         record_idx_iterator = iter(range(len(table)))
         table = sc.append_record_idx_col(table, record_idx_iterator)
+
     return table
 
 
 def download_parquet_with_daft_hash_applied(
-
+    identifier_columns, file, s3_client_kwargs, **kwargs
 ):
-    from daft import TimeUnit
 
     # TODO: Add correct read kwargs as in:
     # https://github.com/ray-project/deltacat/blob/383855a4044e4dfe03cf36d7738359d512a517b4/deltacat/utils/daft.py#L97
@@ -29,15 +46,69 @@ def download_parquet_with_daft_hash_applied(
         kwargs.get("coerce_int96_timestamp_unit", "ms")
     )
 
-    from deltacat.utils.daft import _get_s3_io_config
-
     # TODO: Use Daft SHA1 hash instead to minimize probably of data corruption
     io_config = _get_s3_io_config(s3_client_kwargs=s3_client_kwargs)
-    df =
+    df = daft_read_parquet(
         path=file.file_path,
         io_config=io_config,
         coerce_int96_timestamp_unit=coerce_int96_timestamp_unit,
     )
-
-
-
+
+    hash_column = concatenate_hashed_identifier_columns(
+        df=df, identifier_columns=identifier_columns
+    )
+
+    table = pa.Table.from_arrays(
+        [hash_column], names=[sc._IDENTIFIER_COLUMNS_HASH_COLUMN_NAME]
+    )
+
+    return table
+
+
+def daft_read_parquet(path, io_config, coerce_int96_timestamp_unit):
+    df = daft.read_parquet(
+        path=path,
+        io_config=io_config,
+        coerce_int96_timestamp_unit=coerce_int96_timestamp_unit,
+    )
+    return df
+
+
+def concatenate_hashed_identifier_columns(df, identifier_columns):
+    pk_hash_columns = []
+    previous_hash_column_length = None
+    for i in range(len(identifier_columns)):
+        pk_hash_column = df.select(daft.col(identifier_columns[i]).hash())
+        pk_hash_column_arrow = pk_hash_column.to_arrow()
+
+        # Assert that each hash column downloaded are same length to ensure we don't create mismatch between columns.
+        if not previous_hash_column_length:
+            previous_hash_column_length = len(pk_hash_column_arrow)
+        else:
+            assert previous_hash_column_length == len(pk_hash_column_arrow), (
+                f"Identifier column Length mismatch: {identifier_columns[i]} has length {len(pk_hash_column_arrow)} "
+                f"but expected {previous_hash_column_length}."
+            )
+            previous_hash_column_length = len(pk_hash_column_arrow)
+
+        # Convert identifier from different datatypes to string here
+        pk_hash_column_str = sliced_string_cast(
+            pk_hash_column_arrow[identifier_columns[i]]
+        )
+        assert len(pk_hash_column_str) == previous_hash_column_length, (
+            f"Casting column Length mismatch: {identifier_columns[i]} has length {len(pk_hash_column_str)} after casting, "
+            f"before casting length: {previous_hash_column_length}."
+        )
+
+        pk_hash_columns.append(pk_hash_column_str)
+
+    pk_hash_columns.append(IDENTIFIER_FIELD_DELIMITER)
+    pk_hash_columns_concatenated = pc.binary_join_element_wise(
+        *pk_hash_columns, null_handling="replace"
+    )
+    assert len(pk_hash_columns_concatenated) == previous_hash_column_length, (
+        f"Concatenated column Length mismatch: Final concatenated identifier column has length {len(pk_hash_columns_concatenated)}, "
+        f"before concatenating length: {previous_hash_column_length}."
+    )
+
+    return pk_hash_columns_concatenated
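Note: `concatenate_hashed_identifier_columns` above joins the stringified per-column hashes with `IDENTIFIER_FIELD_DELIMITER` via PyArrow. A minimal standalone sketch of that join call, using made-up sample data (only `pyarrow` is required):

```python
import pyarrow as pa
import pyarrow.compute as pc

# Two already-stringified identifier columns; None models a missing value.
col_a = pa.array(["1", "2", None], type=pa.string())
col_b = pa.array(["x", "y", "z"], type=pa.string())

# Element-wise join with the last argument as the delimiter; "replace" turns
# nulls into empty strings instead of nulling out the whole joined row.
joined = pc.binary_join_element_wise(col_a, col_b, "|", null_handling="replace")
print(joined.to_pylist())  # ['1|x', '2|y', '|z']
```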
deltacat/compute/converter/utils/s3u.py CHANGED

@@ -11,7 +11,7 @@ from deltacat.types.tables import (
     get_table_length,
     TABLE_CLASS_TO_SLICER_FUNC,
 )
-from typing import Optional, Dict, Any
+from typing import Optional, Dict, Any
 from deltacat.exceptions import RetryableError
 from deltacat.storage import (
     DistributedDataset,
@@ -59,7 +59,7 @@ def upload_table_with_retry(
     max_records_per_file: Optional[int] = 4000000,
     s3_file_system=None,
     **s3_client_kwargs,
-)
+):
     """
     Writes the given table to 1 or more S3 files and return Redshift
     manifest entries describing the uploaded files.
@@ -77,7 +77,7 @@ def upload_table_with_retry(
         s3_file_system = get_s3_file_system(content_type=content_type)
     capture_object = CapturedBlockWritePaths()
     block_write_path_provider = UuidBlockWritePathProvider(
-        capture_object=capture_object
+        capture_object=capture_object, base_path=s3_url_prefix
    )
     s3_table_writer_func = get_table_writer(table)
     table_record_count = get_table_length(table)
@@ -110,7 +110,16 @@ def upload_table_with_retry(
     )
     del block_write_path_provider
     write_paths = capture_object.write_paths()
-
+    s3_write_paths = []
+    for path in write_paths:
+        s3_write_path = construct_s3_url(path)
+        s3_write_paths.append(s3_write_path)
+    return s3_write_paths
+
+
+def construct_s3_url(path):
+    if path:
+        return f"s3://{path}"
 
 
 def upload_table(
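Note: with the hunk above, `upload_table_with_retry` now returns fully qualified `s3://` URLs built by the new `construct_s3_url` helper. A small usage sketch (the sample path is illustrative only):

```python
def construct_s3_url(path):
    # mirrors the helper added in the diff above
    if path:
        return f"s3://{path}"


print(construct_s3_url("my-bucket/prefix/part-0001.parquet"))  # s3://my-bucket/prefix/part-0001.parquet
print(construct_s3_url(""))  # None: falsy paths fall through and return None
```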
deltacat/compute/jobs/__init__.py ADDED
File without changes
|
@@ -0,0 +1,404 @@
|
|
1
|
+
# from deltacat.compute import index
|
2
|
+
import subprocess
|
3
|
+
import socket
|
4
|
+
import os
|
5
|
+
import time
|
6
|
+
import re
|
7
|
+
|
8
|
+
import deltacat as dc
|
9
|
+
|
10
|
+
from dataclasses import dataclass
|
11
|
+
|
12
|
+
from typing import Set, Optional, Dict, Any, Union
|
13
|
+
|
14
|
+
from ray.job_submission import JobSubmissionClient, JobStatus
|
15
|
+
|
16
|
+
from deltacat.utils.performance import timed_invocation
|
17
|
+
|
18
|
+
|
19
|
+
def _run_cmd(cmd: str) -> None:
|
20
|
+
exit_code = int(os.system(cmd))
|
21
|
+
assert exit_code == 0, f"`{cmd}` failed. Exit code: {exit_code}"
|
22
|
+
|
23
|
+
|
24
|
+
def _ray_up(cluster_cfg: str, restart_only: bool = False) -> None:
|
25
|
+
restart_flag = "--no-restart" if not restart_only else "--restart-only"
|
26
|
+
print(f"Starting Ray cluster from '{cluster_cfg}'")
|
27
|
+
_run_cmd(
|
28
|
+
f"ray up '{cluster_cfg}' -y --no-config-cache {restart_flag} --disable-usage-stats"
|
29
|
+
)
|
30
|
+
print(f"Started Ray cluster from '{cluster_cfg}'")
|
31
|
+
|
32
|
+
|
33
|
+
def _is_port_in_use(port: Union[int, str]) -> bool:
|
34
|
+
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
|
35
|
+
return s.connect_ex(("localhost", int(port))) == 0
|
36
|
+
|
37
|
+
|
38
|
+
def _is_dashboard_running(port: Union[int, str]) -> bool:
|
39
|
+
return _is_port_in_use(port)
|
40
|
+
|
41
|
+
|
42
|
+
def _ray_dashboard_up(
|
43
|
+
cluster_cfg: str, port: Union[str, int], timeout_seconds=15
|
44
|
+
) -> None:
|
45
|
+
print(f"Starting Ray Dashboard for Ray cluster '{cluster_cfg}'")
|
46
|
+
_run_cmd(f"ray dashboard '{cluster_cfg}' --port {port} &")
|
47
|
+
start = time.monotonic()
|
48
|
+
dashboard_is_up = False
|
49
|
+
while time.monotonic() - start <= timeout_seconds:
|
50
|
+
if _is_dashboard_running(port):
|
51
|
+
dashboard_is_up = True
|
52
|
+
break
|
53
|
+
time.sleep(0.1)
|
54
|
+
if not dashboard_is_up:
|
55
|
+
raise TimeoutError(
|
56
|
+
f"Timed out after waiting {timeout_seconds} seconds for dashboard "
|
57
|
+
f"to establish connection on port {port}."
|
58
|
+
)
|
59
|
+
print(f"Started Ray Dashboard for Ray cluster '{cluster_cfg}'")
|
60
|
+
|
61
|
+
|
62
|
+
def _get_head_node_ip(cluster_cfg: str) -> str:
|
63
|
+
print(f"Getting Ray cluster head node IP for '{cluster_cfg}'")
|
64
|
+
cmd = f"ray get-head-ip '{cluster_cfg}'"
|
65
|
+
proc = subprocess.run(
|
66
|
+
cmd,
|
67
|
+
shell=True,
|
68
|
+
capture_output=True,
|
69
|
+
text=True,
|
70
|
+
check=True,
|
71
|
+
)
|
72
|
+
# the head node IP should be the last line printed to stdout
|
73
|
+
head_node_ip = proc.stdout.splitlines()[-1]
|
74
|
+
if not re.match(
|
75
|
+
r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$",
|
76
|
+
head_node_ip,
|
77
|
+
):
|
78
|
+
print(
|
79
|
+
f"Failed to find Ray Head Node IP Address in `{cmd}` "
|
80
|
+
f"output: {proc.stdout}"
|
81
|
+
)
|
82
|
+
raise RuntimeError("No Ray Head Node IP Address Found")
|
83
|
+
print(f"Ray cluster head node IP for '{cluster_cfg}': {head_node_ip}")
|
84
|
+
return head_node_ip
|
85
|
+
|
86
|
+
|
87
|
+
def _ray_down_cmd(cluster_cfg: str) -> str:
|
88
|
+
return f"ray down '{cluster_cfg}' -y"
|
89
|
+
|
90
|
+
|
91
|
+
def _ray_down(cluster_cfg: str) -> None:
|
92
|
+
print(f"Destroying Ray cluster for '{cluster_cfg}'")
|
93
|
+
_run_cmd(_ray_down_cmd(cluster_cfg))
|
94
|
+
print(f"Destroyed Ray cluster for '{cluster_cfg}'")
|
95
|
+
|
96
|
+
|
97
|
+
def _ray_cluster_running(cluster_cfg: str) -> bool:
|
98
|
+
try:
|
99
|
+
_get_head_node_ip(cluster_cfg)
|
100
|
+
except Exception as e:
|
101
|
+
print(f"Get Head Node IP Failed with Exception: {e}")
|
102
|
+
print(f"Assuming Ray Cluster is Not Running")
|
103
|
+
return False
|
104
|
+
return True
|
105
|
+
|
106
|
+
|
107
|
+
@dataclass(frozen=True)
|
108
|
+
class DeltaCatJobRunResult:
|
109
|
+
job_id: str
|
110
|
+
job_status: JobStatus
|
111
|
+
job_logs: Any
|
112
|
+
|
113
|
+
|
114
|
+
class DeltaCatJobClient(JobSubmissionClient):
|
115
|
+
@staticmethod
|
116
|
+
def of(
|
117
|
+
cluster_cfg_file_path: str = "./deltacat.yaml",
|
118
|
+
*,
|
119
|
+
launch_cluster: bool = True,
|
120
|
+
start_dashboard: bool = True,
|
121
|
+
restart_ray: bool = False,
|
122
|
+
head_node_ip: str = None,
|
123
|
+
dashboard_wait_time_seconds: int = 30,
|
124
|
+
port: Union[int, str] = "8265",
|
125
|
+
):
|
126
|
+
job_submission_client_url = None
|
127
|
+
try:
|
128
|
+
# launch Ray cluster if necessary
|
129
|
+
if cluster_cfg_file_path:
|
130
|
+
if launch_cluster:
|
131
|
+
if not _ray_cluster_running(cluster_cfg_file_path) or restart_ray:
|
132
|
+
_ray_up(cluster_cfg_file_path)
|
133
|
+
elif restart_ray:
|
134
|
+
if _ray_cluster_running(cluster_cfg_file_path):
|
135
|
+
_ray_up(cluster_cfg_file_path, restart_ray)
|
136
|
+
else:
|
137
|
+
raise RuntimeError(
|
138
|
+
f"Cannot Restart Ray: Ray Cluster for "
|
139
|
+
f"`{cluster_cfg_file_path}` not found."
|
140
|
+
)
|
141
|
+
dashboard_running = _is_dashboard_running(port)
|
142
|
+
if not dashboard_running and start_dashboard:
|
143
|
+
_ray_dashboard_up(
|
144
|
+
cluster_cfg=cluster_cfg_file_path,
|
145
|
+
port=port,
|
146
|
+
timeout_seconds=dashboard_wait_time_seconds,
|
147
|
+
)
|
148
|
+
dashboard_running = True
|
149
|
+
if not head_node_ip:
|
150
|
+
head_node_ip = (
|
151
|
+
"127.0.0.1"
|
152
|
+
# use dashboard port forwarding on localhost
|
153
|
+
if dashboard_running
|
154
|
+
# fetch the remote head node IP
|
155
|
+
else _get_head_node_ip(cluster_cfg_file_path)
|
156
|
+
)
|
157
|
+
else:
|
158
|
+
head_node_ip = "127.0.0.1"
|
159
|
+
job_submission_client_url = f"http://{head_node_ip}:{port}"
|
160
|
+
print(
|
161
|
+
f"Initializing Ray Job Submission Client with URL: "
|
162
|
+
f"{job_submission_client_url}"
|
163
|
+
)
|
164
|
+
client = JobSubmissionClient(f"http://{head_node_ip}:{port}")
|
165
|
+
# the below class change is safe as long as we only add new methods
|
166
|
+
# to the wrapped JobSubmissionClient that don't alter its internal
|
167
|
+
# state
|
168
|
+
client.__class__ = DeltaCatJobClient
|
169
|
+
return client
|
170
|
+
except Exception as e:
|
171
|
+
print(f"Unexpected error while initializing Ray Job Client: {e}")
|
172
|
+
if job_submission_client_url:
|
173
|
+
print(
|
174
|
+
f"Please ensure that Ray was installed with a job server "
|
175
|
+
f'enabled via `pip install -U "ray[default]"` and '
|
176
|
+
f"that http://{head_node_ip}:{port} is accessible. You "
|
177
|
+
f"can optionally run `ray dashboard` to forward the "
|
178
|
+
f"remote Ray head node port to a local port (default 8265) "
|
179
|
+
f'then run `ray_job_client("127.0.0.1", 8265)` '
|
180
|
+
f"to connect via localhost."
|
181
|
+
)
|
182
|
+
if cluster_cfg_file_path:
|
183
|
+
print(
|
184
|
+
f"If you're done submitting jobs, ensure that the remote "
|
185
|
+
f"Ray Cluster is shut down by running: "
|
186
|
+
f"{_ray_down_cmd(cluster_cfg_file_path)}"
|
187
|
+
)
|
188
|
+
raise e
|
189
|
+
|
190
|
+
def run_job(
|
191
|
+
self,
|
192
|
+
*,
|
193
|
+
entrypoint: str,
|
194
|
+
runtime_env: Optional[Dict[str, Any]] = None,
|
195
|
+
timeout_seconds: int = 600,
|
196
|
+
**kwargs,
|
197
|
+
) -> DeltaCatJobRunResult:
|
198
|
+
"""
|
199
|
+
Synchronously submit and run a Ray job. This method combines Ray job submission and monitoring by submitting
|
200
|
+
the job to the Ray Job Server, waiting for the job to complete,
|
201
|
+
validating the job's terminal status, retrieving and returning job run
|
202
|
+
result information if successful.
|
203
|
+
|
204
|
+
Args:
|
205
|
+
entrypoint: The entry point for the job to be executed (module
|
206
|
+
or script to run)
|
207
|
+
runtime_env: Runtime environment configuration for the job.
|
208
|
+
Some commonly used keys include `working_dir` (directory
|
209
|
+
containing the job code), `pip` (list of pip packages to
|
210
|
+
install), and `env_vars` (environment variables for the job).
|
211
|
+
timeout_seconds: Maximum time in seconds to wait for job completion.
|
212
|
+
Default to 600 seconds (10 minutes).
|
213
|
+
kwargs: Additional keyword arguments to pass to the job submission.
|
214
|
+
|
215
|
+
Returns:
|
216
|
+
Final results from the successful job run execution.
|
217
|
+
|
218
|
+
Raises:
|
219
|
+
RuntimeError: If the job fails or terminates with status other
|
220
|
+
than SUCCEEDED.
|
221
|
+
TimeoutError: If the job doesn't complete within the specified
|
222
|
+
timeout period
|
223
|
+
|
224
|
+
Example:
|
225
|
+
>>> client = job_client()
|
226
|
+
>>> logs = client.run_job(
|
227
|
+
... # Shell command to run job
|
228
|
+
... entrypoint="my_script.py",
|
229
|
+
... runtime_env={
|
230
|
+
... # Path to the local directory containing my_script.py
|
231
|
+
... "working_dir": "./",
|
232
|
+
... # Pip dependencies to install
|
233
|
+
... "pip": ["pandas", "numpy"],
|
234
|
+
... # System environment variables to set
|
235
|
+
... "env_vars": {"DATA_PATH": "/path/to/data"},
|
236
|
+
... },
|
237
|
+
... timeout_seconds=1200
|
238
|
+
... )
|
239
|
+
"""
|
240
|
+
|
241
|
+
job_id = self.submit_job(
|
242
|
+
entrypoint=entrypoint,
|
243
|
+
runtime_env=runtime_env,
|
244
|
+
**kwargs,
|
245
|
+
)
|
246
|
+
job_status, latency = timed_invocation(
|
247
|
+
self.await_job,
|
248
|
+
job_id,
|
249
|
+
timeout_seconds=timeout_seconds,
|
250
|
+
)
|
251
|
+
job_logs = self.get_job_logs(job_id)
|
252
|
+
if job_status != JobStatus.SUCCEEDED:
|
253
|
+
print(f"Job `{job_id}` logs: ")
|
254
|
+
print(job_logs)
|
255
|
+
raise RuntimeError(f"Job `{job_id}` terminated with status: {job_status}")
|
256
|
+
return DeltaCatJobRunResult(
|
257
|
+
job_id=job_id,
|
258
|
+
job_status=job_status,
|
259
|
+
job_logs=job_logs,
|
260
|
+
)
|
261
|
+
|
262
|
+
def await_job(
|
263
|
+
self,
|
264
|
+
job_id: str,
|
265
|
+
await_status: Set[JobStatus] = {
|
266
|
+
JobStatus.SUCCEEDED,
|
267
|
+
JobStatus.STOPPED,
|
268
|
+
JobStatus.FAILED,
|
269
|
+
},
|
270
|
+
*,
|
271
|
+
timeout_seconds: int = 600,
|
272
|
+
) -> JobStatus:
|
273
|
+
"""
|
274
|
+
Polls a job's status until it matches the desired status or times out.
|
275
|
+
|
276
|
+
This function continuously checks the status of a specified job using the
|
277
|
+
provided client. It will keep polling until either the desired status is
|
278
|
+
reached or the timeout period expires.
|
279
|
+
|
280
|
+
Args:
|
281
|
+
job_id: The unique identifier of the job to monitor.
|
282
|
+
await_status: Set of :class:`ray.job_submission.JobStatus` to wait for.
|
283
|
+
The function will return when the job reaches any of these states.
|
284
|
+
timeout_seconds: Maximum time to wait in seconds.
|
285
|
+
Defaults to 600 seconds (10 minutes).
|
286
|
+
|
287
|
+
Returns:
|
288
|
+
The final status of the job.
|
289
|
+
|
290
|
+
Raises:
|
291
|
+
TimeoutError: If the desired status is not reached within the
|
292
|
+
specified timeout period.
|
293
|
+
|
294
|
+
Example:
|
295
|
+
>>>
|
296
|
+
>>> client = job_client()
|
297
|
+
>>> job_id = client.submit_job(
|
298
|
+
>>> # Shell command to run job
|
299
|
+
>>> entrypoint=f"python copy.py --source '{source}' --dest '{dest}'",
|
300
|
+
>>> # Path to the local directory containing copy.py
|
301
|
+
>>> runtime_env={"working_dir": "./"},
|
302
|
+
>>> )
|
303
|
+
>>> # wait for the job to reach a terminal state
|
304
|
+
>>> client.await_job(job_id)
|
305
|
+
"""
|
306
|
+
start = time.monotonic()
|
307
|
+
terminal_status = None
|
308
|
+
while time.monotonic() - start <= timeout_seconds:
|
309
|
+
status = self.get_job_status(job_id)
|
310
|
+
if status in await_status:
|
311
|
+
terminal_status = status
|
312
|
+
break
|
313
|
+
time.sleep(0.1)
|
314
|
+
if not terminal_status:
|
315
|
+
self.stop_job(job_id)
|
316
|
+
raise TimeoutError(
|
317
|
+
f"Timed out after waiting {timeout_seconds} seconds for job "
|
318
|
+
f"`{job_id}` status: {status}"
|
319
|
+
)
|
320
|
+
return terminal_status
|
321
|
+
|
322
|
+
|
323
|
+
def local_job_client(*args, **kwargs) -> DeltaCatJobClient:
|
324
|
+
"""
|
325
|
+
Create a Ray Job Client that can be used to submit jobs to a local Ray
|
326
|
+
cluster. Initializes Ray if it's not already running.
|
327
|
+
|
328
|
+
Args:
|
329
|
+
*args: Positional arguments to pass to `ray.init()`.
|
330
|
+
**kwargs: Keyword arguments to pass to `ray.init()`.
|
331
|
+
Returns:
|
332
|
+
DeltaCatJobClient: A client instance that can be used to submit and
|
333
|
+
manage local Ray jobs.
|
334
|
+
|
335
|
+
Raises:
|
336
|
+
RuntimeError: If a local Ray Job Server cannot be found.
|
337
|
+
"""
|
338
|
+
if not dc.is_initialized():
|
339
|
+
context = dc.init(*args, **kwargs)
|
340
|
+
else:
|
341
|
+
context = dc.init(ray_init_args={"ignore_reinit_error": True})
|
342
|
+
if context.dashboard_url:
|
343
|
+
head_node_ip, port = context.dashboard_url.split(":")
|
344
|
+
else:
|
345
|
+
# the Ray Dashboard URL is also the Ray Job Server URL
|
346
|
+
raise RuntimeError(
|
347
|
+
"Ray Job Server not found! Please reinstall Ray using "
|
348
|
+
"`pip install -U `ray[default]`"
|
349
|
+
)
|
350
|
+
return DeltaCatJobClient.of(
|
351
|
+
None,
|
352
|
+
launch_cluster=False,
|
353
|
+
start_dashboard=False,
|
354
|
+
head_node_ip=head_node_ip,
|
355
|
+
port=port,
|
356
|
+
)
|
357
|
+
|
358
|
+
|
359
|
+
def job_client(
|
360
|
+
cluster_cfg_file_path: str = "./deltacat.yaml",
|
361
|
+
*,
|
362
|
+
launch_cluster: bool = True,
|
363
|
+
start_dashboard: bool = True,
|
364
|
+
restart_ray: bool = False,
|
365
|
+
head_node_ip: str = None,
|
366
|
+
dashboard_wait_time_seconds: int = 15,
|
367
|
+
port: Union[str, int] = "8265",
|
368
|
+
) -> DeltaCatJobClient:
|
369
|
+
"""
|
370
|
+
Create a DeltaCAT Job Client that can be used to submit jobs to a remote Ray cluster.
|
371
|
+
|
372
|
+
Args:
|
373
|
+
cluster_cfg_file_path: Path to the Ray Cluster Launcher
|
374
|
+
Config file. Defaults to "./deltacat.yaml".
|
375
|
+
launch_cluster : Whether to launch a new Ray cluster.
|
376
|
+
Defaults to True.
|
377
|
+
start_dashboard: Whether to start the Ray dashboard.
|
378
|
+
Defaults to True.
|
379
|
+
restart_ray: Whether to restart Ray if it's already
|
380
|
+
running. Defaults to False.
|
381
|
+
head_node_ip: IP address of the Ray cluster head node.
|
382
|
+
If None, will use the configuration from the cluster config file.
|
383
|
+
Defaults to None.
|
384
|
+
dashboard_wait_time_seconds: Time in seconds to wait for the Ray
|
385
|
+
dashboard to start if `start_dashboard` is True.
|
386
|
+
port: Port number for the Ray
|
387
|
+
dashboard/job server. Defaults to "8265".
|
388
|
+
|
389
|
+
Returns:
|
390
|
+
DeltaCatJobClient: A client instance that can be used to submit and
|
391
|
+
manage jobs on the Ray cluster.
|
392
|
+
|
393
|
+
Raises:
|
394
|
+
RuntimeError: If the Ray Job Server is not found.
|
395
|
+
"""
|
396
|
+
return DeltaCatJobClient.of(
|
397
|
+
cluster_cfg_file_path,
|
398
|
+
launch_cluster=launch_cluster,
|
399
|
+
start_dashboard=start_dashboard,
|
400
|
+
restart_ray=restart_ray,
|
401
|
+
head_node_ip=head_node_ip,
|
402
|
+
dashboard_wait_time_seconds=dashboard_wait_time_seconds,
|
403
|
+
port=port,
|
404
|
+
)
|
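Note: a minimal usage sketch of the new job client API, based on the docstrings in the diff above; the cluster config path, entrypoint, and runtime_env values are placeholders, not prescribed values.

```python
from deltacat.compute.jobs.client import job_client, local_job_client

# Submit to a remote Ray cluster described by a Ray Cluster Launcher config.
client = job_client("./deltacat.yaml")
result = client.run_job(
    entrypoint="python my_script.py",     # shell command to run as the job
    runtime_env={"working_dir": "./"},    # ship the local working directory
    timeout_seconds=1200,
)
print(result.job_id, result.job_status)
print(result.job_logs)

# Or target a local Ray instance (initializes DeltaCAT/Ray if needed).
local_client = local_job_client()
```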
deltacat/constants.py CHANGED

@@ -44,10 +44,9 @@ DELTACAT_ROOT = env_string(
 )
 
 # CLI Args
-METAFILE_FORMAT_KEY = "METAFILE_FORMAT"
 METAFILE_FORMAT_JSON = "json"
 METAFILE_FORMAT_MSGPACK = "msgpack"
-METAFILE_FORMAT = env_string(
+METAFILE_FORMAT = env_string("METAFILE_FORMAT", METAFILE_FORMAT_MSGPACK)
 SUPPORTED_METAFILE_FORMATS = [METAFILE_FORMAT_JSON, METAFILE_FORMAT_MSGPACK]
 METAFILE_EXT = {
     "json": ".json",
@@ -95,11 +94,12 @@ RUNNING_TXN_DIR_NAME: str = "running"
 FAILED_TXN_DIR_NAME: str = "failed"
 SUCCESS_TXN_DIR_NAME: str = "success"
 TXN_PART_SEPARATOR = "_"
+
 # Storage interface defaults
 # These defaults should be applied in catalog interface implementations
 # Storage interface implementations should be agnostic to defaults and require full information
-DEFAULT_CATALOG = "
-DEFAULT_NAMESPACE = "
+DEFAULT_CATALOG = "default"
+DEFAULT_NAMESPACE = "default"
 DEFAULT_TABLE_VERSION = "1"
 DEFAULT_STREAM_ID = "stream"
 DEFAULT_PARTITION_ID = "partition"
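Note: `METAFILE_FORMAT` above is now read from the environment via `env_string`, defaulting to msgpack. A small sketch of overriding it, assuming the constant is resolved at import time so the variable must be set before `deltacat` is imported:

```python
import os

# Opt into JSON metafiles instead of the msgpack default.
os.environ["METAFILE_FORMAT"] = "json"

from deltacat import constants

print(constants.METAFILE_FORMAT)        # "json"
print(constants.METAFILE_EXT["json"])   # ".json"
```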