deltacat 2.0.0b9__py3-none-any.whl → 2.0.0b11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (175)
  1. deltacat/__init__.py +41 -16
  2. deltacat/api.py +478 -123
  3. deltacat/aws/s3u.py +2 -2
  4. deltacat/benchmarking/benchmark_engine.py +4 -2
  5. deltacat/benchmarking/conftest.py +1 -1
  6. deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
  7. deltacat/catalog/__init__.py +62 -5
  8. deltacat/catalog/main/impl.py +26 -10
  9. deltacat/catalog/model/catalog.py +165 -109
  10. deltacat/catalog/model/properties.py +25 -24
  11. deltacat/compute/__init__.py +14 -0
  12. deltacat/compute/converter/constants.py +5 -0
  13. deltacat/compute/converter/converter_session.py +78 -36
  14. deltacat/compute/converter/model/convert_input.py +24 -4
  15. deltacat/compute/converter/model/convert_result.py +61 -0
  16. deltacat/compute/converter/model/converter_session_params.py +52 -10
  17. deltacat/compute/converter/pyiceberg/overrides.py +181 -62
  18. deltacat/compute/converter/steps/convert.py +84 -36
  19. deltacat/compute/converter/steps/dedupe.py +25 -4
  20. deltacat/compute/converter/utils/convert_task_options.py +42 -13
  21. deltacat/compute/converter/utils/iceberg_columns.py +5 -0
  22. deltacat/compute/converter/utils/io.py +82 -11
  23. deltacat/compute/converter/utils/s3u.py +13 -4
  24. deltacat/compute/jobs/client.py +406 -0
  25. deltacat/constants.py +5 -6
  26. deltacat/env.py +10 -0
  27. deltacat/examples/basic_logging.py +6 -6
  28. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +3 -5
  29. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
  30. deltacat/examples/hello_world.py +4 -2
  31. deltacat/examples/indexer/indexer.py +163 -0
  32. deltacat/examples/indexer/job_runner.py +198 -0
  33. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  34. deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
  35. deltacat/{catalog → experimental/catalog}/iceberg/impl.py +27 -9
  36. deltacat/{storage → experimental/storage}/iceberg/iceberg_scan_planner.py +1 -1
  37. deltacat/{storage → experimental/storage}/iceberg/impl.py +1 -1
  38. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  39. deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
  40. deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -9
  41. deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
  42. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  43. deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
  44. deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
  45. deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
  46. deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
  47. deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
  48. deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
  49. deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -1
  50. deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
  51. deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
  52. deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
  53. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  54. deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
  55. deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
  56. deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
  57. deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
  58. deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
  59. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +4 -4
  60. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
  61. deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
  62. deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
  63. deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
  64. deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
  65. deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
  66. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  67. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  68. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  69. deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
  70. deltacat/io/__init__.py +13 -0
  71. deltacat/io/dataset/__init__.py +0 -0
  72. deltacat/io/dataset/deltacat_dataset.py +91 -0
  73. deltacat/io/datasink/__init__.py +0 -0
  74. deltacat/io/datasink/deltacat_datasink.py +207 -0
  75. deltacat/io/datasource/__init__.py +0 -0
  76. deltacat/io/datasource/deltacat_datasource.py +580 -0
  77. deltacat/io/reader/__init__.py +0 -0
  78. deltacat/io/reader/deltacat_read_api.py +172 -0
  79. deltacat/storage/__init__.py +2 -0
  80. deltacat/storage/model/expression/__init__.py +47 -0
  81. deltacat/storage/model/expression/expression.py +656 -0
  82. deltacat/storage/model/expression/visitor.py +248 -0
  83. deltacat/storage/model/metafile.py +74 -42
  84. deltacat/storage/model/scan/push_down.py +32 -5
  85. deltacat/storage/model/shard.py +6 -2
  86. deltacat/storage/model/types.py +5 -3
  87. deltacat/tests/_io/reader/__init__.py +0 -0
  88. deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  89. deltacat/tests/catalog/data/__init__.py +0 -0
  90. deltacat/tests/catalog/main/__init__.py +0 -0
  91. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  92. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +436 -0
  93. deltacat/tests/catalog/model/__init__.py +0 -0
  94. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  95. deltacat/tests/catalog/test_catalogs.py +52 -98
  96. deltacat/tests/catalog/test_default_catalog_impl.py +1 -2
  97. deltacat/tests/compute/converter/test_convert_session.py +209 -46
  98. deltacat/tests/daft/__init__.py +0 -0
  99. deltacat/tests/daft/test_model.py +97 -0
  100. deltacat/tests/experimental/__init__.py +0 -0
  101. deltacat/tests/experimental/catalog/__init__.py +0 -0
  102. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  103. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  104. deltacat/tests/experimental/daft/__init__.py +0 -0
  105. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  106. deltacat/tests/experimental/storage/__init__.py +0 -0
  107. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  108. deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
  109. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  110. deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -2
  111. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  112. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  113. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  114. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  115. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  116. deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
  117. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  118. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  119. deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +6 -4
  120. deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
  121. deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
  122. deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
  123. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  124. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
  125. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
  126. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
  127. deltacat/tests/local_deltacat_storage/__init__.py +1 -0
  128. deltacat/tests/storage/model/test_expression.py +327 -0
  129. deltacat/tests/storage/model/test_shard.py +3 -1
  130. deltacat/tests/test_deltacat_api.py +50 -9
  131. deltacat/types/media.py +141 -43
  132. deltacat/types/tables.py +35 -7
  133. deltacat/utils/daft.py +531 -5
  134. deltacat/utils/export.py +3 -1
  135. deltacat/utils/filesystem.py +39 -9
  136. deltacat/utils/polars.py +128 -0
  137. deltacat/utils/pyarrow.py +151 -15
  138. deltacat/utils/ray_utils/concurrency.py +1 -1
  139. deltacat/utils/ray_utils/runtime.py +56 -4
  140. deltacat/utils/url.py +1284 -0
  141. {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b11.dist-info}/METADATA +11 -9
  142. {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b11.dist-info}/RECORD +168 -123
  143. deltacat/catalog/iceberg/__init__.py +0 -4
  144. deltacat/daft/daft_scan.py +0 -111
  145. deltacat/daft/model.py +0 -258
  146. deltacat/examples/common/fixtures.py +0 -15
  147. deltacat/storage/rivulet/__init__.py +0 -11
  148. deltacat/storage/rivulet/feather/__init__.py +0 -5
  149. deltacat/storage/rivulet/parquet/__init__.py +0 -5
  150. /deltacat/{daft → compute/jobs}/__init__.py +0 -0
  151. /deltacat/examples/{common → experimental}/__init__.py +0 -0
  152. /deltacat/examples/{iceberg → experimental/iceberg}/__init__.py +0 -0
  153. /deltacat/{storage/iceberg → examples/indexer}/__init__.py +0 -0
  154. /deltacat/{storage/rivulet/arrow → examples/indexer/aws}/__init__.py +0 -0
  155. /deltacat/{storage/rivulet/fs → examples/indexer/gcp}/__init__.py +0 -0
  156. /deltacat/{storage/rivulet/metastore → experimental/catalog}/__init__.py +0 -0
  157. /deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +0 -0
  158. /deltacat/{storage/rivulet/reader → experimental/storage}/__init__.py +0 -0
  159. /deltacat/{storage/rivulet/schema → experimental/storage/iceberg}/__init__.py +0 -0
  160. /deltacat/{storage → experimental/storage}/iceberg/model.py +0 -0
  161. /deltacat/{storage/rivulet/writer → experimental/storage/rivulet/arrow}/__init__.py +0 -0
  162. /deltacat/{tests/storage/rivulet → experimental/storage/rivulet/fs}/__init__.py +0 -0
  163. /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
  164. /deltacat/{tests/storage/rivulet/fs → experimental/storage/rivulet/metastore}/__init__.py +0 -0
  165. /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
  166. /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
  167. /deltacat/{storage → experimental/storage}/rivulet/parquet/data_reader.py +0 -0
  168. /deltacat/{tests/storage/rivulet/schema → experimental/storage/rivulet/reader}/__init__.py +0 -0
  169. /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
  170. /deltacat/{tests/storage/rivulet/writer → experimental/storage/rivulet/schema}/__init__.py +0 -0
  171. /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
  172. /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
  173. {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b11.dist-info}/LICENSE +0 -0
  174. {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b11.dist-info}/WHEEL +0 -0
  175. {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b11.dist-info}/top_level.txt +0 -0
@@ -1,26 +1,43 @@
+import logging
+from deltacat import logs
 import deltacat.compute.converter.utils.iceberg_columns as sc
 import daft
+from deltacat.utils.daft import _get_s3_io_config
+from daft import TimeUnit
+import pyarrow as pa
+from deltacat.utils.pyarrow import sliced_string_cast
+from deltacat.compute.converter.constants import IDENTIFIER_FIELD_DELIMITER
+
+import pyarrow.compute as pc
+
+logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
 
 def download_data_table_and_append_iceberg_columns(
-    file, columns_to_download, additional_columns_to_append, sequence_number
+    file,
+    columns_to_download,
+    additional_columns_to_append,
+    sequence_number,
+    s3_client_kwargs,
 ):
-    # TODO; add S3 client kwargs
     table = download_parquet_with_daft_hash_applied(
-        identify_columns=columns_to_download, file=file, s3_client_kwargs={}
+        identifier_columns=columns_to_download,
+        file=file,
+        s3_client_kwargs=s3_client_kwargs,
     )
+
     if sc._FILE_PATH_COLUMN_NAME in additional_columns_to_append:
         table = sc.append_file_path_column(table, file.file_path)
     if sc._ORDERED_RECORD_IDX_COLUMN_NAME in additional_columns_to_append:
         record_idx_iterator = iter(range(len(table)))
         table = sc.append_record_idx_col(table, record_idx_iterator)
+
     return table
 
 
 def download_parquet_with_daft_hash_applied(
-    identify_columns, file, s3_client_kwargs, **kwargs
+    identifier_columns, file, s3_client_kwargs, **kwargs
 ):
-    from daft import TimeUnit
 
     # TODO: Add correct read kwargs as in:
     # https://github.com/ray-project/deltacat/blob/383855a4044e4dfe03cf36d7738359d512a517b4/deltacat/utils/daft.py#L97
@@ -29,15 +46,69 @@ def download_parquet_with_daft_hash_applied(
         kwargs.get("coerce_int96_timestamp_unit", "ms")
     )
 
-    from deltacat.utils.daft import _get_s3_io_config
-
     # TODO: Use Daft SHA1 hash instead to minimize probably of data corruption
     io_config = _get_s3_io_config(s3_client_kwargs=s3_client_kwargs)
-    df = daft.read_parquet(
+    df = daft_read_parquet(
         path=file.file_path,
         io_config=io_config,
         coerce_int96_timestamp_unit=coerce_int96_timestamp_unit,
     )
-    df = df.select(daft.col(identify_columns[0]).hash())
-    arrow_table = df.to_arrow()
-    return arrow_table
+
+    hash_column = concatenate_hashed_identifier_columns(
+        df=df, identifier_columns=identifier_columns
+    )
+
+    table = pa.Table.from_arrays(
+        [hash_column], names=[sc._IDENTIFIER_COLUMNS_HASH_COLUMN_NAME]
+    )
+
+    return table
+
+
+def daft_read_parquet(path, io_config, coerce_int96_timestamp_unit):
+    df = daft.read_parquet(
+        path=path,
+        io_config=io_config,
+        coerce_int96_timestamp_unit=coerce_int96_timestamp_unit,
+    )
+    return df
+
+
+def concatenate_hashed_identifier_columns(df, identifier_columns):
+    pk_hash_columns = []
+    previous_hash_column_length = None
+    for i in range(len(identifier_columns)):
+        pk_hash_column = df.select(daft.col(identifier_columns[i]).hash())
+        pk_hash_column_arrow = pk_hash_column.to_arrow()
+
+        # Assert that each hash column downloaded are same length to ensure we don't create mismatch between columns.
+        if not previous_hash_column_length:
+            previous_hash_column_length = len(pk_hash_column_arrow)
+        else:
+            assert previous_hash_column_length == len(pk_hash_column_arrow), (
+                f"Identifier column Length mismatch: {identifier_columns[i]} has length {len(pk_hash_column_arrow)} "
+                f"but expected {previous_hash_column_length}."
+            )
+            previous_hash_column_length = len(pk_hash_column_arrow)
+
+        # Convert identifier from different datatypes to string here
+        pk_hash_column_str = sliced_string_cast(
+            pk_hash_column_arrow[identifier_columns[i]]
+        )
+        assert len(pk_hash_column_str) == previous_hash_column_length, (
+            f"Casting column Length mismatch: {identifier_columns[i]} has length {len(pk_hash_column_str)} after casting, "
+            f"before casting length: {previous_hash_column_length}."
+        )
+
+        pk_hash_columns.append(pk_hash_column_str)
+
+    pk_hash_columns.append(IDENTIFIER_FIELD_DELIMITER)
+    pk_hash_columns_concatenated = pc.binary_join_element_wise(
+        *pk_hash_columns, null_handling="replace"
+    )
+    assert len(pk_hash_columns_concatenated) == previous_hash_column_length, (
+        f"Concatenated column Length mismatch: Final concatenated identifier column has length {len(pk_hash_columns_concatenated)}, "
+        f"before concatenating length: {previous_hash_column_length}."
+    )
+
+    return pk_hash_columns_concatenated
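
For reference, the concatenation step above relies on pyarrow's `binary_join_element_wise`, which treats its last positional argument as the per-row separator; this is why `IDENTIFIER_FIELD_DELIMITER` is appended to the end of `pk_hash_columns` before the join. A minimal sketch of that behavior (the column values and the `|` delimiter are illustrative, not taken from the package):

    import pyarrow as pa
    import pyarrow.compute as pc

    # two hypothetical identifier columns already cast to strings
    col_a = pa.array(["1", "2", None])
    col_b = pa.array(["x", "y", "z"])

    # the last positional argument is the per-row separator;
    # null_handling="replace" substitutes nulls with "" instead of emitting null rows
    joined = pc.binary_join_element_wise(col_a, col_b, "|", null_handling="replace")
    print(joined.to_pylist())  # ['1|x', '2|y', '|z']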
@@ -11,7 +11,7 @@ from deltacat.types.tables import (
     get_table_length,
     TABLE_CLASS_TO_SLICER_FUNC,
 )
-from typing import Optional, Dict, Any, List
+from typing import Optional, Dict, Any
 from deltacat.exceptions import RetryableError
 from deltacat.storage import (
     DistributedDataset,
@@ -59,7 +59,7 @@ def upload_table_with_retry(
     max_records_per_file: Optional[int] = 4000000,
     s3_file_system=None,
     **s3_client_kwargs,
-) -> List[str]:
+):
     """
     Writes the given table to 1 or more S3 files and return Redshift
     manifest entries describing the uploaded files.
@@ -77,7 +77,7 @@
         s3_file_system = get_s3_file_system(content_type=content_type)
     capture_object = CapturedBlockWritePaths()
     block_write_path_provider = UuidBlockWritePathProvider(
-        capture_object=capture_object
+        capture_object=capture_object, base_path=s3_url_prefix
    )
     s3_table_writer_func = get_table_writer(table)
     table_record_count = get_table_length(table)
@@ -110,7 +110,16 @@
     )
     del block_write_path_provider
     write_paths = capture_object.write_paths()
-    return write_paths
+    s3_write_paths = []
+    for path in write_paths:
+        s3_write_path = construct_s3_url(path)
+        s3_write_paths.append(s3_write_path)
+    return s3_write_paths
+
+
+def construct_s3_url(path):
+    if path:
+        return f"s3://{path}"
 
 
 def upload_table(
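
The practical effect of the last hunk above is that `upload_table_with_retry` now returns fully qualified S3 URLs instead of bare bucket-relative paths. A small sketch of the transformation (the paths are hypothetical):

    def construct_s3_url(path):
        if path:
            return f"s3://{path}"

    write_paths = ["my-bucket/prefix/file-1.parquet", "my-bucket/prefix/file-2.parquet"]
    print([construct_s3_url(p) for p in write_paths])
    # ['s3://my-bucket/prefix/file-1.parquet', 's3://my-bucket/prefix/file-2.parquet']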
@@ -0,0 +1,406 @@
+# from deltacat.compute import index
+import subprocess
+import socket
+import os
+import time
+import re
+
+import deltacat as dc
+
+from dataclasses import dataclass
+
+from typing import Set, Optional, Dict, Any, Union
+
+from ray.job_submission import JobSubmissionClient, JobStatus
+
+from deltacat.utils.performance import timed_invocation
+
+
+def _run_cmd(cmd: str) -> None:
+    exit_code = int(os.system(cmd))
+    assert exit_code == 0, f"`{cmd}` failed. Exit code: {exit_code}"
+
+
+def _ray_up(cluster_cfg: str, restart_only: bool = False) -> None:
+    restart_flag = "--no-restart" if not restart_only else "--restart-only"
+    print(f"Starting Ray cluster from '{cluster_cfg}'")
+    _run_cmd(
+        f"ray up '{cluster_cfg}' -y --no-config-cache {restart_flag} --disable-usage-stats"
+    )
+    print(f"Started Ray cluster from '{cluster_cfg}'")
+
+
+def _is_port_in_use(port: Union[int, str]) -> bool:
+    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+        return s.connect_ex(("localhost", int(port))) == 0
+
+
+def _is_dashboard_running(port: Union[int, str]) -> bool:
+    return _is_port_in_use(port)
+
+
+def _ray_dashboard_up(
+    cluster_cfg: str, port: Union[str, int], timeout_seconds=15
+) -> None:
+    print(f"Starting Ray Dashboard for Ray cluster '{cluster_cfg}'")
+    _run_cmd(f"ray dashboard '{cluster_cfg}' --port {port} &")
+    start = time.monotonic()
+    dashboard_is_up = False
+    while time.monotonic() - start <= timeout_seconds:
+        if _is_dashboard_running(port):
+            dashboard_is_up = True
+            break
+        time.sleep(0.1)
+    if not dashboard_is_up:
+        raise TimeoutError(
+            f"Timed out after waiting {timeout_seconds} seconds for dashboard "
+            f"to establish connection on port {port}."
+        )
+    print(f"Started Ray Dashboard for Ray cluster '{cluster_cfg}'")
+
+
+def _get_head_node_ip(cluster_cfg: str) -> str:
+    print(f"Getting Ray cluster head node IP for '{cluster_cfg}'")
+    cmd = f"ray get-head-ip '{cluster_cfg}'"
+    proc = subprocess.run(
+        cmd,
+        shell=True,
+        capture_output=True,
+        text=True,
+        check=True,
+    )
+    # the head node IP should be the last line printed to stdout
+    # TODO(pdames): add IPv6 support
+    head_node_ip = proc.stdout.splitlines()[-1]
+    if not re.match(
+        r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$",
+        head_node_ip,
+    ):
+        print(
+            f"Failed to find Ray Head Node IP Address in `{cmd}` "
+            f"output: {proc.stdout}"
+        )
+        raise RuntimeError("No Ray Head Node IP Address Found")
+    print(f"Ray cluster head node IP for '{cluster_cfg}': {head_node_ip}")
+    return head_node_ip
+
+
+def _ray_down_cmd(cluster_cfg: str) -> str:
+    return f"ray down '{cluster_cfg}' -y"
+
+
+def _ray_down(cluster_cfg: str) -> None:
+    print(f"Destroying Ray cluster for '{cluster_cfg}'")
+    _run_cmd(_ray_down_cmd(cluster_cfg))
+    print(f"Destroyed Ray cluster for '{cluster_cfg}'")
+
+
+def _ray_cluster_running(cluster_cfg: str) -> bool:
+    try:
+        _get_head_node_ip(cluster_cfg)
+    except Exception as e:
+        print(f"Get Head Node IP Failed with Exception: {e}")
+        print(f"Assuming Ray Cluster is Not Running")
+        return False
+    return True
+
+
+@dataclass(frozen=True)
+class DeltaCatJobRunResult:
+    job_id: str
+    job_status: JobStatus
+    job_logs: Any
+
+
+class DeltaCatJobClient(JobSubmissionClient):
+    @staticmethod
+    def of(
+        cluster_cfg_file_path: str = "./deltacat.yaml",
+        *,
+        launch_cluster: bool = True,
+        start_dashboard: bool = True,
+        restart_ray: bool = False,
+        head_node_ip: str = None,
+        dashboard_wait_time_seconds: int = 30,
+        port: Union[int, str] = "8265",
+    ):
+        job_submission_client_url = None
+        try:
+            # launch Ray cluster if necessary
+            if cluster_cfg_file_path:
+                if launch_cluster:
+                    if not _ray_cluster_running(cluster_cfg_file_path) or restart_ray:
+                        _ray_up(cluster_cfg_file_path)
+                elif restart_ray:
+                    if _ray_cluster_running(cluster_cfg_file_path):
+                        _ray_up(cluster_cfg_file_path, restart_ray)
+                    else:
+                        raise RuntimeError(
+                            f"Cannot Restart Ray: Ray Cluster for "
+                            f"`{cluster_cfg_file_path}` not found."
+                        )
+                dashboard_running = _is_dashboard_running(port)
+                if not dashboard_running and start_dashboard:
+                    _ray_dashboard_up(
+                        cluster_cfg=cluster_cfg_file_path,
+                        port=port,
+                        timeout_seconds=dashboard_wait_time_seconds,
+                    )
+                    dashboard_running = True
+                if not head_node_ip:
+                    head_node_ip = (
+                        "127.0.0.1"
+                        # use dashboard port forwarding on localhost
+                        if dashboard_running
+                        # fetch the remote head node IP
+                        else _get_head_node_ip(cluster_cfg_file_path)
+                    )
+            else:
+                head_node_ip = "127.0.0.1"
+            job_submission_client_url = f"http://{head_node_ip}:{port}"
+            print(
+                f"Initializing Ray Job Submission Client with URL: "
+                f"{job_submission_client_url}"
+            )
+            client = JobSubmissionClient(f"http://{head_node_ip}:{port}")
+            # the below class change is safe as long as we only add new methods
+            # to the wrapped JobSubmissionClient that don't alter its internal
+            # state
+            client.__class__ = DeltaCatJobClient
+            return client
+        except Exception as e:
+            print(f"Unexpected error while initializing Ray Job Client: {e}")
+            if job_submission_client_url:
+                print(
+                    f"Please ensure that Ray was installed with a job server "
+                    f'enabled via `pip install -U "ray[default]"` and '
+                    f"that http://{head_node_ip}:{port} is accessible. You "
+                    f"can optionally run `ray dashboard` to forward the "
+                    f"remote Ray head node port to a local port (default 8265) "
+                    f'then run `ray_job_client("127.0.0.1", 8265)` '
+                    f"to connect via localhost."
+                )
+            if cluster_cfg_file_path:
+                print(
+                    f"If you're done submitting jobs, ensure that the remote "
+                    f"Ray Cluster is shut down by running: "
+                    f"{_ray_down_cmd(cluster_cfg_file_path)}"
+                )
+            raise e
+
+    def run_job(
+        self,
+        *,
+        entrypoint: str,
+        runtime_env: Optional[Dict[str, Any]] = None,
+        timeout_seconds: int = 600,
+        **kwargs,
+    ) -> DeltaCatJobRunResult:
+        """
+        Synchronously submit and run a Ray job. This method combines Ray job submission and monitoring by submitting
+        the job to the Ray Job Server, waiting for the job to complete,
+        validating the job's terminal status, retrieving and returning job run
+        result information if successful.
+
+        Args:
+            entrypoint: The entry point for the job to be executed (module
+                or script to run)
+            runtime_env: Runtime environment configuration for the job.
+                Some commonly used keys include `working_dir` (directory
+                containing the job code), `pip` (list of pip packages to
+                install), and `env_vars` (environment variables for the job).
+            timeout_seconds: Maximum time in seconds to wait for job completion.
+                Default to 600 seconds (10 minutes).
+            kwargs: Additional keyword arguments to pass to the job submission.
+
+        Returns:
+            Final results from the successful job run execution.
+
+        Raises:
+            RuntimeError: If the job fails or terminates with status other
+                than SUCCEEDED.
+            TimeoutError: If the job doesn't complete within the specified
+                timeout period
+
+        Example:
+            >>> client = job_client()
+            >>> logs = client.run_job(
+            ...     # Shell command to run job
+            ...     entrypoint="my_script.py",
+            ...     runtime_env={
+            ...         # Path to the local directory containing my_script.py
+            ...         "working_dir": "./",
+            ...         # Pip dependencies to install
+            ...         "pip": ["pandas", "numpy"],
+            ...         # System environment variables to set
+            ...         "env_vars": {"DATA_PATH": "/path/to/data"},
+            ...     },
+            ...     timeout_seconds=1200
+            ... )
+        """
+
+        job_id = self.submit_job(
+            entrypoint=entrypoint,
+            runtime_env=runtime_env,
+            **kwargs,
+        )
+        job_status, latency = timed_invocation(
+            self.await_job,
+            job_id,
+            timeout_seconds=timeout_seconds,
+        )
+        job_logs = self.get_job_logs(job_id)
+        if job_status != JobStatus.SUCCEEDED:
+            print(f"Job `{job_id}` logs: ")
+            print(job_logs)
+            raise RuntimeError(f"Job `{job_id}` terminated with status: {job_status}")
+        return DeltaCatJobRunResult(
+            job_id=job_id,
+            job_status=job_status,
+            job_logs=job_logs,
+        )
+
+    def await_job(
+        self,
+        job_id: str,
+        await_status: Set[JobStatus] = {
+            JobStatus.SUCCEEDED,
+            JobStatus.STOPPED,
+            JobStatus.FAILED,
+        },
+        *,
+        timeout_seconds: int = 600,
+    ) -> JobStatus:
+        """
+        Polls a job's status until it matches the desired status or times out.
+
+        This function continuously checks the status of a specified job using the
+        provided client. It will keep polling until either the desired status is
+        reached or the timeout period expires.
+
+        Args:
+            job_id: The unique identifier of the job to monitor.
+            await_status: Set of :class:`ray.job_submission.JobStatus` to wait for.
+                The function will return when the job reaches any of these states.
+            timeout_seconds: Maximum time to wait in seconds.
+                Defaults to 600 seconds (10 minutes).
+
+        Returns:
+            The final status of the job.
+
+        Raises:
+            TimeoutError: If the desired status is not reached within the
+                specified timeout period.
+
+        Example:
+            >>>
+            >>> client = job_client()
+            >>> job_id = client.submit_job(
+            >>>     # Shell command to run job
+            >>>     entrypoint=f"python copy.py --source '{source}' --dest '{dest}'",
+            >>>     # Path to the local directory containing copy.py
+            >>>     runtime_env={"working_dir": "./"},
+            >>> )
+            >>> # wait for the job to reach a terminal state
+            >>> client.await_job(job_id)
+        """
+        start = time.monotonic()
+        terminal_status = None
+        while time.monotonic() - start <= timeout_seconds:
+            status = self.get_job_status(job_id)
+            if status in await_status:
+                terminal_status = status
+                break
+            time.sleep(0.1)
+        if not terminal_status:
+            self.stop_job(job_id)
+            raise TimeoutError(
+                f"Timed out after waiting {timeout_seconds} seconds for job "
+                f"`{job_id}` status: {status}"
+            )
+        return terminal_status
+
+
+def local_job_client(*args, **kwargs) -> DeltaCatJobClient:
+    """
+    Create a DeltaCAT Job Client that can be used to submit jobs to a local Ray
+    cluster. Initializes Ray if it's not already running.
+
+    Args:
+        *args: Positional arguments to pass to `deltacat.init()`.
+        **kwargs: Keyword arguments to pass to `deltacat.init()`.
+    Returns:
+        DeltaCatJobClient: A client instance that can be used to submit and
+            manage local Ray jobs.
+
+    Raises:
+        RuntimeError: If a local Ray Job Server cannot be found.
+    """
+    if not dc.is_initialized():
+        context = dc.init(*args, **kwargs)
+    else:
+        context = dc.init()
+    if context.dashboard_url:
+        head_node_ip, port = context.dashboard_url.split(":")
+    else:
+        # the Ray Dashboard URL is also the Ray Job Server URL
+        raise RuntimeError(
+            "Ray Job Server not found! Please reinstall Ray using "
+            "`pip install -U `ray[default]`"
+        )
+    return DeltaCatJobClient.of(
+        None,
+        launch_cluster=False,
+        start_dashboard=False,
+        head_node_ip=head_node_ip,
+        port=port,
+    )
+
+
+def job_client(
+    cluster_cfg_file_path: str = "./deltacat.yaml",
+    *,
+    launch_cluster: bool = True,
+    start_dashboard: bool = True,
+    restart_ray: bool = False,
+    head_node_ip: str = None,
+    dashboard_wait_time_seconds: int = 15,
+    port: Union[str, int] = "8265",
+) -> DeltaCatJobClient:
+    """
+    Create a DeltaCAT Job Client that can be used to submit jobs to a remote
+    Ray cluster.
+
+    Args:
+        cluster_cfg_file_path: Path to the Ray Cluster Launcher
+            Config file. Defaults to "./deltacat.yaml".
+        launch_cluster: Whether to launch a new Ray cluster.
+            Defaults to True.
+        start_dashboard: Whether to start the Ray dashboard.
+            Defaults to True.
+        restart_ray: Whether to restart Ray if it's already
+            running. Defaults to False.
+        head_node_ip: IP address of the Ray cluster head node.
+            If None, will use the configuration from the cluster config file.
+            Defaults to None.
+        dashboard_wait_time_seconds: Time in seconds to wait for the Ray
+            dashboard to start if `start_dashboard` is True.
+        port: Port number for the Ray
+            dashboard/job server. Defaults to "8265".
+
+    Returns:
+        DeltaCatJobClient: A client instance that can be used to submit and
+            manage jobs on the Ray cluster.
+
+    Raises:
+        RuntimeError: If the Ray Job Server is not found.
+    """
+    return DeltaCatJobClient.of(
+        cluster_cfg_file_path,
+        launch_cluster=launch_cluster,
+        start_dashboard=start_dashboard,
+        restart_ray=restart_ray,
+        head_node_ip=head_node_ip,
+        dashboard_wait_time_seconds=dashboard_wait_time_seconds,
+        port=port,
+    )
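
The module path is inferred from the file listing above (deltacat/compute/jobs/client.py); a hedged sketch of how the new client might be used end to end, where the entrypoint script and working directory are hypothetical:

    from deltacat.compute.jobs.client import local_job_client

    # connect to a job server on a locally running Ray instance
    # (initializes DeltaCAT/Ray if it is not already running)
    client = local_job_client()

    # synchronously submit a job and wait up to 5 minutes for a terminal status
    result = client.run_job(
        entrypoint="python my_script.py",   # hypothetical entrypoint
        runtime_env={"working_dir": "./"},  # hypothetical job code directory
        timeout_seconds=300,
    )
    print(result.job_id, result.job_status)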
deltacat/constants.py CHANGED
@@ -2,7 +2,6 @@ from __future__ import annotations
 
 
 from deltacat.utils.common import env_string, env_bool
-import os
 
 # Environment variables
 DELTACAT_SYS_LOG_LEVEL = env_string("DELTACAT_SYS_LOG_LEVEL", "DEBUG")
@@ -40,14 +39,13 @@ DELTACAT_LOGGER_USE_SINGLE_HANDLER = env_bool(
 )
 DELTACAT_ROOT = env_string(
     "DELTACAT_ROOT",
-    os.path.join(os.getcwd(), ".deltacat"),
+    "",
 )
 
 # CLI Args
-METAFILE_FORMAT_KEY = "METAFILE_FORMAT"
 METAFILE_FORMAT_JSON = "json"
 METAFILE_FORMAT_MSGPACK = "msgpack"
-METAFILE_FORMAT = env_string(METAFILE_FORMAT_KEY, METAFILE_FORMAT_MSGPACK)
+METAFILE_FORMAT = env_string("METAFILE_FORMAT", METAFILE_FORMAT_MSGPACK)
 SUPPORTED_METAFILE_FORMATS = [METAFILE_FORMAT_JSON, METAFILE_FORMAT_MSGPACK]
 METAFILE_EXT = {
     "json": ".json",
@@ -95,11 +93,12 @@ RUNNING_TXN_DIR_NAME: str = "running"
 FAILED_TXN_DIR_NAME: str = "failed"
 SUCCESS_TXN_DIR_NAME: str = "success"
 TXN_PART_SEPARATOR = "_"
+
 # Storage interface defaults
 # These defaults should be applied in catalog interface implementations
 # Storage interface implementations should be agnostic to defaults and require full information
-DEFAULT_CATALOG = "DEFAULT"
-DEFAULT_NAMESPACE = "DEFAULT"
+DEFAULT_CATALOG = "default"
+DEFAULT_NAMESPACE = "default"
 DEFAULT_TABLE_VERSION = "1"
 DEFAULT_STREAM_ID = "stream"
 DEFAULT_PARTITION_ID = "partition"
deltacat/env.py CHANGED
@@ -1,3 +1,4 @@
+import argparse
 import os
 import logging
 from typing import Dict, Any
@@ -49,3 +50,12 @@ def create_ray_runtime_environment() -> Dict[str, Any]:
         "env_vars": worker_env_vars,
     }
     return runtime_environment
+
+
+def store_cli_args_in_os_environ(script_args_list=[]):
+    parser = argparse.ArgumentParser()
+    for args, kwargs in script_args_list:
+        parser.add_argument(*args, **kwargs)
+    args = parser.parse_args()
+    print(f"Command Line Arguments: {args}")
+    os.environ.update(vars(args))
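
A sketch of how the new `store_cli_args_in_os_environ` helper might be called from an example script; the argument names are illustrative. Each list entry is an (args, kwargs) pair forwarded to `argparse.ArgumentParser.add_argument`, and the parsed values are written into `os.environ`:

    from deltacat.env import store_cli_args_in_os_environ

    store_cli_args_in_os_environ(
        [
            (["--catalog-root"], {"help": "catalog root path", "default": "/tmp/deltacat"}),
            (["--table-name"], {"help": "table to index", "required": True}),
        ]
    )
    # the script can now read the parsed values back from os.environ,
    # e.g. os.environ["catalog_root"] and os.environ["table_name"]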
@@ -1,12 +1,12 @@
 import os
-import ray
+import deltacat
 import logging
 
+import ray
+
 from deltacat import logs
 from deltacat.constants import DELTACAT_APP_LOG_DIR, DELTACAT_SYS_LOG_DIR
-from deltacat.examples.common.fixtures import (
-    store_cli_args_in_os_environ,
-)
+from env import store_cli_args_in_os_environ
 from deltacat.env import create_ray_runtime_environment
 
 # initialize the driver logger
@@ -94,8 +94,8 @@ if __name__ == "__main__":
     # create any runtime environment required to run the example
     runtime_env = create_ray_runtime_environment()
 
-    # initialize ray
-    ray.init(runtime_env=runtime_env)
+    # initialize deltacat
+    deltacat.init(ray_init_args={"runtime_env": runtime_env})
 
     # run the example using os.environ as kwargs
     run(**os.environ)
@@ -9,10 +9,8 @@ import deltacat as dc
 
 from deltacat import logs
 from deltacat import IcebergCatalog
-from deltacat.catalog.iceberg import IcebergCatalogConfig
-from deltacat.examples.common.fixtures import (
-    store_cli_args_in_os_environ,
-)
+from deltacat.experimental.catalog.iceberg import IcebergCatalogConfig
+from env import store_cli_args_in_os_environ
 
 from pyiceberg.schema import (
     Schema,
@@ -23,7 +21,7 @@ from pyiceberg.schema import (
 from pyiceberg.partitioning import PartitionSpec, PartitionField
 from pyiceberg.transforms import BucketTransform
 
-from deltacat.storage.iceberg.model import (
+from deltacat.experimental.storage.iceberg.model import (
     SchemaMapper,
     PartitionSchemeMapper,
 )