deltacat 2.0.0b9__py3-none-any.whl → 2.0.0b10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73)
  1. deltacat/__init__.py +27 -6
  2. deltacat/api.py +478 -123
  3. deltacat/aws/s3u.py +2 -2
  4. deltacat/benchmarking/conftest.py +1 -1
  5. deltacat/catalog/main/impl.py +12 -6
  6. deltacat/catalog/model/catalog.py +65 -47
  7. deltacat/catalog/model/properties.py +1 -3
  8. deltacat/compute/__init__.py +14 -0
  9. deltacat/compute/converter/constants.py +5 -0
  10. deltacat/compute/converter/converter_session.py +78 -36
  11. deltacat/compute/converter/model/convert_input.py +24 -4
  12. deltacat/compute/converter/model/convert_result.py +61 -0
  13. deltacat/compute/converter/model/converter_session_params.py +52 -10
  14. deltacat/compute/converter/pyiceberg/overrides.py +181 -62
  15. deltacat/compute/converter/steps/convert.py +84 -36
  16. deltacat/compute/converter/steps/dedupe.py +25 -4
  17. deltacat/compute/converter/utils/convert_task_options.py +42 -13
  18. deltacat/compute/converter/utils/iceberg_columns.py +5 -0
  19. deltacat/compute/converter/utils/io.py +82 -11
  20. deltacat/compute/converter/utils/s3u.py +13 -4
  21. deltacat/compute/jobs/__init__.py +0 -0
  22. deltacat/compute/jobs/client.py +404 -0
  23. deltacat/constants.py +4 -4
  24. deltacat/daft/daft_scan.py +7 -3
  25. deltacat/daft/translator.py +126 -0
  26. deltacat/examples/basic_logging.py +5 -3
  27. deltacat/examples/hello_world.py +4 -2
  28. deltacat/examples/indexer/__init__.py +0 -0
  29. deltacat/examples/indexer/aws/__init__.py +0 -0
  30. deltacat/examples/indexer/gcp/__init__.py +0 -0
  31. deltacat/examples/indexer/indexer.py +163 -0
  32. deltacat/examples/indexer/job_runner.py +199 -0
  33. deltacat/io/__init__.py +13 -0
  34. deltacat/io/dataset/__init__.py +0 -0
  35. deltacat/io/dataset/deltacat_dataset.py +91 -0
  36. deltacat/io/datasink/__init__.py +0 -0
  37. deltacat/io/datasink/deltacat_datasink.py +207 -0
  38. deltacat/io/datasource/__init__.py +0 -0
  39. deltacat/io/datasource/deltacat_datasource.py +580 -0
  40. deltacat/io/reader/__init__.py +0 -0
  41. deltacat/io/reader/deltacat_read_api.py +172 -0
  42. deltacat/storage/__init__.py +2 -0
  43. deltacat/storage/model/expression/__init__.py +47 -0
  44. deltacat/storage/model/expression/expression.py +656 -0
  45. deltacat/storage/model/expression/visitor.py +248 -0
  46. deltacat/storage/model/metafile.py +74 -42
  47. deltacat/storage/model/scan/push_down.py +32 -5
  48. deltacat/storage/model/types.py +5 -3
  49. deltacat/storage/rivulet/__init__.py +4 -4
  50. deltacat/tests/_io/reader/__init__.py +0 -0
  51. deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  52. deltacat/tests/compute/converter/test_convert_session.py +209 -46
  53. deltacat/tests/local_deltacat_storage/__init__.py +1 -0
  54. deltacat/tests/storage/model/test_expression.py +327 -0
  55. deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +2 -1
  56. deltacat/tests/storage/rivulet/test_dataset.py +1 -1
  57. deltacat/tests/storage/rivulet/test_manifest.py +1 -1
  58. deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +1 -1
  59. deltacat/tests/test_deltacat_api.py +50 -9
  60. deltacat/types/media.py +141 -43
  61. deltacat/types/tables.py +35 -7
  62. deltacat/utils/daft.py +2 -2
  63. deltacat/utils/filesystem.py +39 -9
  64. deltacat/utils/polars.py +128 -0
  65. deltacat/utils/pyarrow.py +151 -15
  66. deltacat/utils/ray_utils/concurrency.py +1 -1
  67. deltacat/utils/ray_utils/runtime.py +56 -4
  68. deltacat/utils/url.py +1284 -0
  69. {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b10.dist-info}/METADATA +9 -6
  70. {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b10.dist-info}/RECORD +73 -48
  71. {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b10.dist-info}/LICENSE +0 -0
  72. {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b10.dist-info}/WHEEL +0 -0
  73. {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b10.dist-info}/top_level.txt +0 -0
@@ -4,7 +4,10 @@ from deltacat.exceptions import RetryableError
  AVERAGE_FILE_PATH_COLUMN_SIZE_BYTES = 80
  AVERAGE_POS_COLUMN_SIZE_BYTES = 4
  XXHASH_BYTE_PER_RECORD = 8
- MEMORY_BUFFER_RATE = 1.2
+ MEMORY_BUFFER_RATE = 2
+ # TODO: Add audit info to check this number in practice
+ # Worst case 2 as no duplicates exists across all pk
+ PYARROW_AGGREGATE_MEMORY_MULTIPLIER = 2


  def estimate_fixed_hash_columns(hash_value_size_bytes_per_record, total_record_count):
@@ -13,8 +16,8 @@ def estimate_fixed_hash_columns(hash_value_size_bytes_per_record, total_record_c

  def get_total_record_from_iceberg_files(iceberg_files_list):
      total_record_count = 0
-     for iceberg_files in iceberg_files_list:
-         total_record_count += sum(file.record_count for file in iceberg_files)
+     # file are in form of tuple (sequence_number, DataFile)
+     total_record_count += sum(file[1].record_count for file in iceberg_files_list)
      return total_record_count


@@ -68,7 +71,8 @@ def _get_task_options(
          task_opts["resources"] = ray_custom_resources

      task_opts["max_retries"] = 3
-
+     task_opts["num_cpus"] = 1
+     task_opts["resources"] = {"convert_task": 1}
      # List of possible botocore exceptions are available at
      # https://github.com/boto/botocore/blob/develop/botocore/exceptions.py
      task_opts["retry_exceptions"] = [RetryableError]
@@ -76,13 +80,38 @@
      return task_opts


- def convert_resource_options_provider(index, files_for_each_bucket):
-     (
-         data_files_list,
-         equality_delete_files_list,
-         position_delete_files_list,
-     ) = files_for_each_bucket[1]
-     memory_requirement = estimate_convert_remote_option_resources(
-         data_files_list, equality_delete_files_list
+ def estimate_dedupe_memory(all_data_files_for_dedupe):
+     dedupe_record_count = get_total_record_from_iceberg_files(all_data_files_for_dedupe)
+     produced_pos_memory_required = estimate_iceberg_pos_delete_additional_columns(
+         ["file_path", "pos"], dedupe_record_count
+     )
+     download_pk_memory_required = estimate_fixed_hash_columns(
+         XXHASH_BYTE_PER_RECORD, dedupe_record_count
      )
-     return _get_task_options(memory=memory_requirement)
+     memory_required_by_dedupe = (
+         produced_pos_memory_required + download_pk_memory_required
+     ) * PYARROW_AGGREGATE_MEMORY_MULTIPLIER
+     memory_with_buffer = memory_required_by_dedupe * MEMORY_BUFFER_RATE
+     return memory_with_buffer
+
+
+ def convert_resource_options_provider(index, convert_input_files):
+     applicable_data_files = convert_input_files.applicable_data_files
+     applicable_equality_delete_files = (
+         convert_input_files.applicable_equality_delete_files
+     )
+     all_data_files_for_dedupe = convert_input_files.all_data_files_for_dedupe
+     total_memory_required = 0
+     if applicable_data_files and applicable_equality_delete_files:
+         memory_requirement_for_convert_equality_deletes = (
+             estimate_convert_remote_option_resources(
+                 applicable_data_files, applicable_equality_delete_files
+             )
+         )
+         total_memory_required += memory_requirement_for_convert_equality_deletes
+     if all_data_files_for_dedupe:
+         memory_requirement_for_dedupe = estimate_dedupe_memory(
+             all_data_files_for_dedupe
+         )
+         total_memory_required += memory_requirement_for_dedupe
+     return _get_task_options(memory=total_memory_required)
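The new estimate_dedupe_memory path sizes the dedupe task from the per-record column constants and the two multipliers above. A rough illustrative sketch, assuming estimate_iceberg_pos_delete_additional_columns simply multiplies the per-record "file_path" and "pos" column sizes by the record count:

    # Hypothetical back-of-the-envelope estimate for 1,000,000 dedupe records.
    records = 1_000_000
    pos_delete_bytes = (80 + 4) * records  # AVERAGE_FILE_PATH_COLUMN_SIZE_BYTES + AVERAGE_POS_COLUMN_SIZE_BYTES
    pk_hash_bytes = 8 * records            # XXHASH_BYTE_PER_RECORD
    memory = (pos_delete_bytes + pk_hash_bytes) * 2 * 2  # PYARROW_AGGREGATE_MEMORY_MULTIPLIER * MEMORY_BUFFER_RATE
    print(memory)  # 368000000 bytes (~368 MB) requested via _get_task_options(memory=...)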
@@ -80,3 +80,8 @@ def append_global_record_idx_column(
          pa.array(ordered_record_indices, _GLOBAL_RECORD_IDX_COLUMN_TYPE),
      )
      return table
+
+
+ _IDENTIFIER_COLUMNS_HASH_COLUMN_NAME = _get_iceberg_col_name(
+     "identifier_columns_hashed"
+ )
@@ -1,26 +1,43 @@
+ import logging
+ from deltacat import logs
  import deltacat.compute.converter.utils.iceberg_columns as sc
  import daft
+ from deltacat.utils.daft import _get_s3_io_config
+ from daft import TimeUnit
+ import pyarrow as pa
+ from deltacat.utils.pyarrow import sliced_string_cast
+ from deltacat.compute.converter.constants import IDENTIFIER_FIELD_DELIMITER
+
+ import pyarrow.compute as pc
+
+ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))


  def download_data_table_and_append_iceberg_columns(
-     file, columns_to_download, additional_columns_to_append, sequence_number
+     file,
+     columns_to_download,
+     additional_columns_to_append,
+     sequence_number,
+     s3_client_kwargs,
  ):
-     # TODO; add S3 client kwargs
      table = download_parquet_with_daft_hash_applied(
-         identify_columns=columns_to_download, file=file, s3_client_kwargs={}
+         identifier_columns=columns_to_download,
+         file=file,
+         s3_client_kwargs=s3_client_kwargs,
      )
+
      if sc._FILE_PATH_COLUMN_NAME in additional_columns_to_append:
          table = sc.append_file_path_column(table, file.file_path)
      if sc._ORDERED_RECORD_IDX_COLUMN_NAME in additional_columns_to_append:
          record_idx_iterator = iter(range(len(table)))
          table = sc.append_record_idx_col(table, record_idx_iterator)
+
      return table


  def download_parquet_with_daft_hash_applied(
-     identify_columns, file, s3_client_kwargs, **kwargs
+     identifier_columns, file, s3_client_kwargs, **kwargs
  ):
-     from daft import TimeUnit

      # TODO: Add correct read kwargs as in:
      # https://github.com/ray-project/deltacat/blob/383855a4044e4dfe03cf36d7738359d512a517b4/deltacat/utils/daft.py#L97
@@ -29,15 +46,69 @@ def download_parquet_with_daft_hash_applied(
          kwargs.get("coerce_int96_timestamp_unit", "ms")
      )

-     from deltacat.utils.daft import _get_s3_io_config
-
      # TODO: Use Daft SHA1 hash instead to minimize probably of data corruption
      io_config = _get_s3_io_config(s3_client_kwargs=s3_client_kwargs)
-     df = daft.read_parquet(
+     df = daft_read_parquet(
          path=file.file_path,
          io_config=io_config,
          coerce_int96_timestamp_unit=coerce_int96_timestamp_unit,
      )
-     df = df.select(daft.col(identify_columns[0]).hash())
-     arrow_table = df.to_arrow()
-     return arrow_table
+
+     hash_column = concatenate_hashed_identifier_columns(
+         df=df, identifier_columns=identifier_columns
+     )
+
+     table = pa.Table.from_arrays(
+         [hash_column], names=[sc._IDENTIFIER_COLUMNS_HASH_COLUMN_NAME]
+     )
+
+     return table
+
+
+ def daft_read_parquet(path, io_config, coerce_int96_timestamp_unit):
+     df = daft.read_parquet(
+         path=path,
+         io_config=io_config,
+         coerce_int96_timestamp_unit=coerce_int96_timestamp_unit,
+     )
+     return df
+
+
+ def concatenate_hashed_identifier_columns(df, identifier_columns):
+     pk_hash_columns = []
+     previous_hash_column_length = None
+     for i in range(len(identifier_columns)):
+         pk_hash_column = df.select(daft.col(identifier_columns[i]).hash())
+         pk_hash_column_arrow = pk_hash_column.to_arrow()
+
+         # Assert that each hash column downloaded are same length to ensure we don't create mismatch between columns.
+         if not previous_hash_column_length:
+             previous_hash_column_length = len(pk_hash_column_arrow)
+         else:
+             assert previous_hash_column_length == len(pk_hash_column_arrow), (
+                 f"Identifier column Length mismatch: {identifier_columns[i]} has length {len(pk_hash_column_arrow)} "
+                 f"but expected {previous_hash_column_length}."
+             )
+             previous_hash_column_length = len(pk_hash_column_arrow)
+
+         # Convert identifier from different datatypes to string here
+         pk_hash_column_str = sliced_string_cast(
+             pk_hash_column_arrow[identifier_columns[i]]
+         )
+         assert len(pk_hash_column_str) == previous_hash_column_length, (
+             f"Casting column Length mismatch: {identifier_columns[i]} has length {len(pk_hash_column_str)} after casting, "
+             f"before casting length: {previous_hash_column_length}."
+         )
+
+         pk_hash_columns.append(pk_hash_column_str)
+
+     pk_hash_columns.append(IDENTIFIER_FIELD_DELIMITER)
+     pk_hash_columns_concatenated = pc.binary_join_element_wise(
+         *pk_hash_columns, null_handling="replace"
+     )
+     assert len(pk_hash_columns_concatenated) == previous_hash_column_length, (
+         f"Concatenated column Length mismatch: Final concatenated identifier column has length {len(pk_hash_columns_concatenated)}, "
+         f"before concatenating length: {previous_hash_column_length}."
+     )
+
+     return pk_hash_columns_concatenated
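The per-column identifier hashes are cast to strings and then joined element-wise, with IDENTIFIER_FIELD_DELIMITER passed as the separator argument of pyarrow.compute.binary_join_element_wise. A minimal standalone sketch of that join, using made-up hash strings and "|" as a stand-in delimiter:

    import pyarrow as pa
    import pyarrow.compute as pc

    col_a = pa.array(["h1", "h2", "h3"])  # stringified hashes of the first identifier column
    col_b = pa.array(["x1", None, "x3"])  # stringified hashes of the second identifier column
    joined = pc.binary_join_element_wise(col_a, col_b, "|", null_handling="replace")
    print(joined)  # ["h1|x1", "h2|", "h3|x3"]; nulls are replaced with "" instead of nulling the whole row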
@@ -11,7 +11,7 @@ from deltacat.types.tables import (
      get_table_length,
      TABLE_CLASS_TO_SLICER_FUNC,
  )
- from typing import Optional, Dict, Any, List
+ from typing import Optional, Dict, Any
  from deltacat.exceptions import RetryableError
  from deltacat.storage import (
      DistributedDataset,
@@ -59,7 +59,7 @@ def upload_table_with_retry(
      max_records_per_file: Optional[int] = 4000000,
      s3_file_system=None,
      **s3_client_kwargs,
- ) -> List[str]:
+ ):
      """
      Writes the given table to 1 or more S3 files and return Redshift
      manifest entries describing the uploaded files.
@@ -77,7 +77,7 @@
          s3_file_system = get_s3_file_system(content_type=content_type)
      capture_object = CapturedBlockWritePaths()
      block_write_path_provider = UuidBlockWritePathProvider(
-         capture_object=capture_object
+         capture_object=capture_object, base_path=s3_url_prefix
      )
      s3_table_writer_func = get_table_writer(table)
      table_record_count = get_table_length(table)
@@ -110,7 +110,16 @@
      )
      del block_write_path_provider
      write_paths = capture_object.write_paths()
-     return write_paths
+     s3_write_paths = []
+     for path in write_paths:
+         s3_write_path = construct_s3_url(path)
+         s3_write_paths.append(s3_write_path)
+     return s3_write_paths
+
+
+ def construct_s3_url(path):
+     if path:
+         return f"s3://{path}"


  def upload_table(
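construct_s3_url only prefixes non-empty paths, so an empty or None write path falls through and implicitly returns None. Illustrative values:

    construct_s3_url("my-bucket/output/part-0000.parquet")  # -> "s3://my-bucket/output/part-0000.parquet"
    construct_s3_url("")                                     # -> None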
@@ -0,0 +1,404 @@
+ # from deltacat.compute import index
+ import subprocess
+ import socket
+ import os
+ import time
+ import re
+
+ import deltacat as dc
+
+ from dataclasses import dataclass
+
+ from typing import Set, Optional, Dict, Any, Union
+
+ from ray.job_submission import JobSubmissionClient, JobStatus
+
+ from deltacat.utils.performance import timed_invocation
+
+
+ def _run_cmd(cmd: str) -> None:
+     exit_code = int(os.system(cmd))
+     assert exit_code == 0, f"`{cmd}` failed. Exit code: {exit_code}"
+
+
+ def _ray_up(cluster_cfg: str, restart_only: bool = False) -> None:
+     restart_flag = "--no-restart" if not restart_only else "--restart-only"
+     print(f"Starting Ray cluster from '{cluster_cfg}'")
+     _run_cmd(
+         f"ray up '{cluster_cfg}' -y --no-config-cache {restart_flag} --disable-usage-stats"
+     )
+     print(f"Started Ray cluster from '{cluster_cfg}'")
+
+
+ def _is_port_in_use(port: Union[int, str]) -> bool:
+     with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+         return s.connect_ex(("localhost", int(port))) == 0
+
+
+ def _is_dashboard_running(port: Union[int, str]) -> bool:
+     return _is_port_in_use(port)
+
+
+ def _ray_dashboard_up(
+     cluster_cfg: str, port: Union[str, int], timeout_seconds=15
+ ) -> None:
+     print(f"Starting Ray Dashboard for Ray cluster '{cluster_cfg}'")
+     _run_cmd(f"ray dashboard '{cluster_cfg}' --port {port} &")
+     start = time.monotonic()
+     dashboard_is_up = False
+     while time.monotonic() - start <= timeout_seconds:
+         if _is_dashboard_running(port):
+             dashboard_is_up = True
+             break
+         time.sleep(0.1)
+     if not dashboard_is_up:
+         raise TimeoutError(
+             f"Timed out after waiting {timeout_seconds} seconds for dashboard "
+             f"to establish connection on port {port}."
+         )
+     print(f"Started Ray Dashboard for Ray cluster '{cluster_cfg}'")
+
+
+ def _get_head_node_ip(cluster_cfg: str) -> str:
+     print(f"Getting Ray cluster head node IP for '{cluster_cfg}'")
+     cmd = f"ray get-head-ip '{cluster_cfg}'"
+     proc = subprocess.run(
+         cmd,
+         shell=True,
+         capture_output=True,
+         text=True,
+         check=True,
+     )
+     # the head node IP should be the last line printed to stdout
+     head_node_ip = proc.stdout.splitlines()[-1]
+     if not re.match(
+         r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$",
+         head_node_ip,
+     ):
+         print(
+             f"Failed to find Ray Head Node IP Address in `{cmd}` "
+             f"output: {proc.stdout}"
+         )
+         raise RuntimeError("No Ray Head Node IP Address Found")
+     print(f"Ray cluster head node IP for '{cluster_cfg}': {head_node_ip}")
+     return head_node_ip
+
+
+ def _ray_down_cmd(cluster_cfg: str) -> str:
+     return f"ray down '{cluster_cfg}' -y"
+
+
+ def _ray_down(cluster_cfg: str) -> None:
+     print(f"Destroying Ray cluster for '{cluster_cfg}'")
+     _run_cmd(_ray_down_cmd(cluster_cfg))
+     print(f"Destroyed Ray cluster for '{cluster_cfg}'")
+
+
+ def _ray_cluster_running(cluster_cfg: str) -> bool:
+     try:
+         _get_head_node_ip(cluster_cfg)
+     except Exception as e:
+         print(f"Get Head Node IP Failed with Exception: {e}")
+         print(f"Assuming Ray Cluster is Not Running")
+         return False
+     return True
+
+
+ @dataclass(frozen=True)
+ class DeltaCatJobRunResult:
+     job_id: str
+     job_status: JobStatus
+     job_logs: Any
+
+
+ class DeltaCatJobClient(JobSubmissionClient):
+     @staticmethod
+     def of(
+         cluster_cfg_file_path: str = "./deltacat.yaml",
+         *,
+         launch_cluster: bool = True,
+         start_dashboard: bool = True,
+         restart_ray: bool = False,
+         head_node_ip: str = None,
+         dashboard_wait_time_seconds: int = 30,
+         port: Union[int, str] = "8265",
+     ):
+         job_submission_client_url = None
+         try:
+             # launch Ray cluster if necessary
+             if cluster_cfg_file_path:
+                 if launch_cluster:
+                     if not _ray_cluster_running(cluster_cfg_file_path) or restart_ray:
+                         _ray_up(cluster_cfg_file_path)
+                 elif restart_ray:
+                     if _ray_cluster_running(cluster_cfg_file_path):
+                         _ray_up(cluster_cfg_file_path, restart_ray)
+                     else:
+                         raise RuntimeError(
+                             f"Cannot Restart Ray: Ray Cluster for "
+                             f"`{cluster_cfg_file_path}` not found."
+                         )
+                 dashboard_running = _is_dashboard_running(port)
+                 if not dashboard_running and start_dashboard:
+                     _ray_dashboard_up(
+                         cluster_cfg=cluster_cfg_file_path,
+                         port=port,
+                         timeout_seconds=dashboard_wait_time_seconds,
+                     )
+                     dashboard_running = True
+                 if not head_node_ip:
+                     head_node_ip = (
+                         "127.0.0.1"
+                         # use dashboard port forwarding on localhost
+                         if dashboard_running
+                         # fetch the remote head node IP
+                         else _get_head_node_ip(cluster_cfg_file_path)
+                     )
+             else:
+                 head_node_ip = "127.0.0.1"
+             job_submission_client_url = f"http://{head_node_ip}:{port}"
+             print(
+                 f"Initializing Ray Job Submission Client with URL: "
+                 f"{job_submission_client_url}"
+             )
+             client = JobSubmissionClient(f"http://{head_node_ip}:{port}")
+             # the below class change is safe as long as we only add new methods
+             # to the wrapped JobSubmissionClient that don't alter its internal
+             # state
+             client.__class__ = DeltaCatJobClient
+             return client
+         except Exception as e:
+             print(f"Unexpected error while initializing Ray Job Client: {e}")
+             if job_submission_client_url:
+                 print(
+                     f"Please ensure that Ray was installed with a job server "
+                     f'enabled via `pip install -U "ray[default]"` and '
+                     f"that http://{head_node_ip}:{port} is accessible. You "
+                     f"can optionally run `ray dashboard` to forward the "
+                     f"remote Ray head node port to a local port (default 8265) "
+                     f'then run `ray_job_client("127.0.0.1", 8265)` '
+                     f"to connect via localhost."
+                 )
+             if cluster_cfg_file_path:
+                 print(
+                     f"If you're done submitting jobs, ensure that the remote "
+                     f"Ray Cluster is shut down by running: "
+                     f"{_ray_down_cmd(cluster_cfg_file_path)}"
+                 )
+             raise e
+
+     def run_job(
+         self,
+         *,
+         entrypoint: str,
+         runtime_env: Optional[Dict[str, Any]] = None,
+         timeout_seconds: int = 600,
+         **kwargs,
+     ) -> DeltaCatJobRunResult:
+         """
+         Synchronously submit and run a Ray job. This method combines Ray job submission and monitoring by submitting
+         the job to the Ray Job Server, waiting for the job to complete,
+         validating the job's terminal status, retrieving and returning job run
+         result information if successful.
+
+         Args:
+             entrypoint: The entry point for the job to be executed (module
+                 or script to run)
+             runtime_env: Runtime environment configuration for the job.
+                 Some commonly used keys include `working_dir` (directory
+                 containing the job code), `pip` (list of pip packages to
+                 install), and `env_vars` (environment variables for the job).
+             timeout_seconds: Maximum time in seconds to wait for job completion.
+                 Default to 600 seconds (10 minutes).
+             kwargs: Additional keyword arguments to pass to the job submission.
+
+         Returns:
+             Final results from the successful job run execution.
+
+         Raises:
+             RuntimeError: If the job fails or terminates with status other
+                 than SUCCEEDED.
+             TimeoutError: If the job doesn't complete within the specified
+                 timeout period
+
+         Example:
+             >>> client = job_client()
+             >>> logs = client.run_job(
+             ...     # Shell command to run job
+             ...     entrypoint="my_script.py",
+             ...     runtime_env={
+             ...         # Path to the local directory containing my_script.py
+             ...         "working_dir": "./",
+             ...         # Pip dependencies to install
+             ...         "pip": ["pandas", "numpy"],
+             ...         # System environment variables to set
+             ...         "env_vars": {"DATA_PATH": "/path/to/data"},
+             ...     },
+             ...     timeout_seconds=1200
+             ... )
+         """
+
+         job_id = self.submit_job(
+             entrypoint=entrypoint,
+             runtime_env=runtime_env,
+             **kwargs,
+         )
+         job_status, latency = timed_invocation(
+             self.await_job,
+             job_id,
+             timeout_seconds=timeout_seconds,
+         )
+         job_logs = self.get_job_logs(job_id)
+         if job_status != JobStatus.SUCCEEDED:
+             print(f"Job `{job_id}` logs: ")
+             print(job_logs)
+             raise RuntimeError(f"Job `{job_id}` terminated with status: {job_status}")
+         return DeltaCatJobRunResult(
+             job_id=job_id,
+             job_status=job_status,
+             job_logs=job_logs,
+         )
+
+     def await_job(
+         self,
+         job_id: str,
+         await_status: Set[JobStatus] = {
+             JobStatus.SUCCEEDED,
+             JobStatus.STOPPED,
+             JobStatus.FAILED,
+         },
+         *,
+         timeout_seconds: int = 600,
+     ) -> JobStatus:
+         """
+         Polls a job's status until it matches the desired status or times out.
+
+         This function continuously checks the status of a specified job using the
+         provided client. It will keep polling until either the desired status is
+         reached or the timeout period expires.
+
+         Args:
+             job_id: The unique identifier of the job to monitor.
+             await_status: Set of :class:`ray.job_submission.JobStatus` to wait for.
+                 The function will return when the job reaches any of these states.
+             timeout_seconds: Maximum time to wait in seconds.
+                 Defaults to 600 seconds (10 minutes).
+
+         Returns:
+             The final status of the job.
+
+         Raises:
+             TimeoutError: If the desired status is not reached within the
+                 specified timeout period.
+
+         Example:
+             >>>
+             >>> client = job_client()
+             >>> job_id = client.submit_job(
+             >>>     # Shell command to run job
+             >>>     entrypoint=f"python copy.py --source '{source}' --dest '{dest}'",
+             >>>     # Path to the local directory containing copy.py
+             >>>     runtime_env={"working_dir": "./"},
+             >>> )
+             >>> # wait for the job to reach a terminal state
+             >>> client.await_job(job_id)
+         """
+         start = time.monotonic()
+         terminal_status = None
+         while time.monotonic() - start <= timeout_seconds:
+             status = self.get_job_status(job_id)
+             if status in await_status:
+                 terminal_status = status
+                 break
+             time.sleep(0.1)
+         if not terminal_status:
+             self.stop_job(job_id)
+             raise TimeoutError(
+                 f"Timed out after waiting {timeout_seconds} seconds for job "
+                 f"`{job_id}` status: {status}"
+             )
+         return terminal_status
+
+
+ def local_job_client(*args, **kwargs) -> DeltaCatJobClient:
+     """
+     Create a Ray Job Client that can be used to submit jobs to a local Ray
+     cluster. Initializes Ray if it's not already running.
+
+     Args:
+         *args: Positional arguments to pass to `ray.init()`.
+         **kwargs: Keyword arguments to pass to `ray.init()`.
+     Returns:
+         DeltaCatJobClient: A client instance that can be used to submit and
+             manage local Ray jobs.
+
+     Raises:
+         RuntimeError: If a local Ray Job Server cannot be found.
+     """
+     if not dc.is_initialized():
+         context = dc.init(*args, **kwargs)
+     else:
+         context = dc.init(ray_init_args={"ignore_reinit_error": True})
+     if context.dashboard_url:
+         head_node_ip, port = context.dashboard_url.split(":")
+     else:
+         # the Ray Dashboard URL is also the Ray Job Server URL
+         raise RuntimeError(
+             "Ray Job Server not found! Please reinstall Ray using "
+             "`pip install -U `ray[default]`"
+         )
+     return DeltaCatJobClient.of(
+         None,
+         launch_cluster=False,
+         start_dashboard=False,
+         head_node_ip=head_node_ip,
+         port=port,
+     )
+
+
+ def job_client(
+     cluster_cfg_file_path: str = "./deltacat.yaml",
+     *,
+     launch_cluster: bool = True,
+     start_dashboard: bool = True,
+     restart_ray: bool = False,
+     head_node_ip: str = None,
+     dashboard_wait_time_seconds: int = 15,
+     port: Union[str, int] = "8265",
+ ) -> DeltaCatJobClient:
+     """
+     Create a DeltaCAT Job Client that can be used to submit jobs to a remote Ray cluster.
+
+     Args:
+         cluster_cfg_file_path: Path to the Ray Cluster Launcher
+             Config file. Defaults to "./deltacat.yaml".
+         launch_cluster : Whether to launch a new Ray cluster.
+             Defaults to True.
+         start_dashboard: Whether to start the Ray dashboard.
+             Defaults to True.
+         restart_ray: Whether to restart Ray if it's already
+             running. Defaults to False.
+         head_node_ip: IP address of the Ray cluster head node.
+             If None, will use the configuration from the cluster config file.
+             Defaults to None.
+         dashboard_wait_time_seconds: Time in seconds to wait for the Ray
+             dashboard to start if `start_dashboard` is True.
+         port: Port number for the Ray
+             dashboard/job server. Defaults to "8265".
+
+     Returns:
+         DeltaCatJobClient: A client instance that can be used to submit and
+             manage jobs on the Ray cluster.
+
+     Raises:
+         RuntimeError: If the Ray Job Server is not found.
+     """
+     return DeltaCatJobClient.of(
+         cluster_cfg_file_path,
+         launch_cluster=launch_cluster,
+         start_dashboard=start_dashboard,
+         restart_ray=restart_ray,
+         head_node_ip=head_node_ip,
+         dashboard_wait_time_seconds=dashboard_wait_time_seconds,
+         port=port,
+     )
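A minimal end-to-end usage sketch for the new job client, assuming a Ray Cluster Launcher config at ./deltacat.yaml and a local my_script.py (both hypothetical):

    from deltacat.compute.jobs.client import job_client

    client = job_client("./deltacat.yaml")  # launches the cluster/dashboard if they aren't already running
    result = client.run_job(
        entrypoint="python my_script.py",
        runtime_env={"working_dir": "./"},
        timeout_seconds=1200,
    )
    print(result.job_id, result.job_status)  # DeltaCatJobRunResult fields
    print(result.job_logs)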
deltacat/constants.py CHANGED
@@ -44,10 +44,9 @@ DELTACAT_ROOT = env_string(
  )

  # CLI Args
- METAFILE_FORMAT_KEY = "METAFILE_FORMAT"
  METAFILE_FORMAT_JSON = "json"
  METAFILE_FORMAT_MSGPACK = "msgpack"
- METAFILE_FORMAT = env_string(METAFILE_FORMAT_KEY, METAFILE_FORMAT_MSGPACK)
+ METAFILE_FORMAT = env_string("METAFILE_FORMAT", METAFILE_FORMAT_MSGPACK)
  SUPPORTED_METAFILE_FORMATS = [METAFILE_FORMAT_JSON, METAFILE_FORMAT_MSGPACK]
  METAFILE_EXT = {
      "json": ".json",
@@ -95,11 +94,12 @@ RUNNING_TXN_DIR_NAME: str = "running"
  FAILED_TXN_DIR_NAME: str = "failed"
  SUCCESS_TXN_DIR_NAME: str = "success"
  TXN_PART_SEPARATOR = "_"
+
  # Storage interface defaults
  # These defaults should be applied in catalog interface implementations
  # Storage interface implementations should be agnostic to defaults and require full information
- DEFAULT_CATALOG = "DEFAULT"
- DEFAULT_NAMESPACE = "DEFAULT"
+ DEFAULT_CATALOG = "default"
+ DEFAULT_NAMESPACE = "default"
  DEFAULT_TABLE_VERSION = "1"
  DEFAULT_STREAM_ID = "stream"
  DEFAULT_PARTITION_ID = "partition"
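METAFILE_FORMAT is still read from the METAFILE_FORMAT environment variable with msgpack as the default; only the intermediate METAFILE_FORMAT_KEY constant was dropped. A small sketch of opting into JSON metafiles, assuming the variable is set before deltacat.constants is first imported:

    import os

    os.environ["METAFILE_FORMAT"] = "json"  # must be set before importing deltacat
    from deltacat.constants import METAFILE_FORMAT, METAFILE_EXT
    assert METAFILE_FORMAT == "json"
    print(METAFILE_EXT[METAFILE_FORMAT])  # ".json"
    # DEFAULT_CATALOG / DEFAULT_NAMESPACE are now the lowercase string "default"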