deltacat 2.0.0b11__py3-none-any.whl → 2.0.0b12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (194) hide show
  1. deltacat/__init__.py +78 -3
  2. deltacat/api.py +122 -67
  3. deltacat/aws/constants.py +0 -23
  4. deltacat/aws/s3u.py +4 -631
  5. deltacat/benchmarking/conftest.py +0 -18
  6. deltacat/catalog/__init__.py +2 -0
  7. deltacat/catalog/delegate.py +445 -63
  8. deltacat/catalog/interface.py +188 -62
  9. deltacat/catalog/main/impl.py +2417 -271
  10. deltacat/catalog/model/catalog.py +49 -10
  11. deltacat/catalog/model/properties.py +38 -0
  12. deltacat/compute/compactor/compaction_session.py +97 -75
  13. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  14. deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
  15. deltacat/compute/compactor/model/round_completion_info.py +16 -6
  16. deltacat/compute/compactor/repartition_session.py +8 -21
  17. deltacat/compute/compactor/steps/hash_bucket.py +5 -5
  18. deltacat/compute/compactor/steps/materialize.py +9 -7
  19. deltacat/compute/compactor/steps/repartition.py +12 -11
  20. deltacat/compute/compactor/utils/io.py +6 -5
  21. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  22. deltacat/compute/compactor/utils/system_columns.py +3 -1
  23. deltacat/compute/compactor_v2/compaction_session.py +17 -14
  24. deltacat/compute/compactor_v2/constants.py +30 -1
  25. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  26. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  27. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  28. deltacat/compute/compactor_v2/model/merge_input.py +33 -8
  29. deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
  30. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  31. deltacat/compute/compactor_v2/steps/merge.py +267 -55
  32. deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
  33. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  34. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  35. deltacat/compute/compactor_v2/utils/io.py +11 -4
  36. deltacat/compute/compactor_v2/utils/merge.py +15 -2
  37. deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
  38. deltacat/compute/compactor_v2/utils/task_options.py +45 -33
  39. deltacat/compute/converter/converter_session.py +145 -32
  40. deltacat/compute/converter/model/convert_input.py +26 -19
  41. deltacat/compute/converter/model/convert_input_files.py +33 -16
  42. deltacat/compute/converter/model/convert_result.py +35 -16
  43. deltacat/compute/converter/model/converter_session_params.py +24 -21
  44. deltacat/compute/converter/pyiceberg/catalog.py +21 -18
  45. deltacat/compute/converter/pyiceberg/overrides.py +18 -9
  46. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
  47. deltacat/compute/converter/steps/convert.py +157 -50
  48. deltacat/compute/converter/steps/dedupe.py +24 -11
  49. deltacat/compute/converter/utils/convert_task_options.py +27 -12
  50. deltacat/compute/converter/utils/converter_session_utils.py +126 -60
  51. deltacat/compute/converter/utils/iceberg_columns.py +8 -8
  52. deltacat/compute/converter/utils/io.py +101 -12
  53. deltacat/compute/converter/utils/s3u.py +33 -27
  54. deltacat/compute/janitor.py +205 -0
  55. deltacat/compute/jobs/client.py +19 -8
  56. deltacat/compute/resource_estimation/delta.py +38 -6
  57. deltacat/compute/resource_estimation/model.py +8 -0
  58. deltacat/constants.py +44 -0
  59. deltacat/docs/autogen/schema/__init__.py +0 -0
  60. deltacat/docs/autogen/schema/inference/__init__.py +0 -0
  61. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  62. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  63. deltacat/examples/compactor/__init__.py +0 -0
  64. deltacat/examples/compactor/aws/__init__.py +1 -0
  65. deltacat/examples/compactor/bootstrap.py +863 -0
  66. deltacat/examples/compactor/compactor.py +373 -0
  67. deltacat/examples/compactor/explorer.py +473 -0
  68. deltacat/examples/compactor/gcp/__init__.py +1 -0
  69. deltacat/examples/compactor/job_runner.py +439 -0
  70. deltacat/examples/compactor/utils/__init__.py +1 -0
  71. deltacat/examples/compactor/utils/common.py +261 -0
  72. deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
  73. deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
  74. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  75. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  76. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  77. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  78. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  79. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  80. deltacat/exceptions.py +66 -4
  81. deltacat/experimental/catalog/iceberg/impl.py +2 -2
  82. deltacat/experimental/compatibility/__init__.py +0 -0
  83. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  84. deltacat/experimental/converter_agent/__init__.py +0 -0
  85. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  86. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  87. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  88. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +105 -4
  89. deltacat/experimental/storage/iceberg/impl.py +5 -3
  90. deltacat/experimental/storage/iceberg/model.py +7 -3
  91. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  92. deltacat/experimental/storage/rivulet/dataset.py +0 -3
  93. deltacat/experimental/storage/rivulet/metastore/delta.py +0 -2
  94. deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +3 -2
  95. deltacat/io/datasource/deltacat_datasource.py +0 -1
  96. deltacat/storage/__init__.py +20 -2
  97. deltacat/storage/interface.py +54 -32
  98. deltacat/storage/main/impl.py +1494 -541
  99. deltacat/storage/model/delta.py +27 -3
  100. deltacat/storage/model/locator.py +6 -12
  101. deltacat/storage/model/manifest.py +182 -6
  102. deltacat/storage/model/metafile.py +151 -78
  103. deltacat/storage/model/namespace.py +8 -1
  104. deltacat/storage/model/partition.py +117 -42
  105. deltacat/storage/model/schema.py +2427 -159
  106. deltacat/storage/model/sort_key.py +40 -0
  107. deltacat/storage/model/stream.py +9 -2
  108. deltacat/storage/model/table.py +12 -1
  109. deltacat/storage/model/table_version.py +11 -0
  110. deltacat/storage/model/transaction.py +1184 -208
  111. deltacat/storage/model/transform.py +81 -2
  112. deltacat/storage/model/types.py +48 -26
  113. deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
  114. deltacat/tests/aws/test_s3u.py +2 -31
  115. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1606 -70
  116. deltacat/tests/catalog/test_catalogs.py +54 -11
  117. deltacat/tests/catalog/test_default_catalog_impl.py +12152 -71
  118. deltacat/tests/compute/compact_partition_test_cases.py +35 -8
  119. deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
  120. deltacat/tests/compute/compactor/utils/test_io.py +124 -120
  121. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  122. deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
  123. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
  124. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
  125. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
  126. deltacat/tests/compute/conftest.py +8 -44
  127. deltacat/tests/compute/converter/test_convert_session.py +675 -490
  128. deltacat/tests/compute/converter/utils.py +15 -6
  129. deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
  130. deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
  131. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
  132. deltacat/tests/compute/test_compact_partition_params.py +13 -8
  133. deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
  134. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
  135. deltacat/tests/compute/test_janitor.py +236 -0
  136. deltacat/tests/compute/test_util_common.py +716 -43
  137. deltacat/tests/compute/test_util_constant.py +0 -1
  138. deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
  139. deltacat/tests/experimental/__init__.py +1 -0
  140. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  141. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  142. deltacat/tests/storage/main/test_main_storage.py +6900 -95
  143. deltacat/tests/storage/model/test_metafile_io.py +78 -173
  144. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  145. deltacat/tests/storage/model/test_schema.py +171 -0
  146. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  147. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  148. deltacat/tests/storage/model/test_transaction.py +393 -48
  149. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  150. deltacat/tests/test_deltacat_api.py +988 -4
  151. deltacat/tests/test_exceptions.py +9 -5
  152. deltacat/tests/test_utils/pyarrow.py +52 -21
  153. deltacat/tests/test_utils/storage.py +23 -34
  154. deltacat/tests/types/__init__.py +0 -0
  155. deltacat/tests/types/test_tables.py +104 -0
  156. deltacat/tests/utils/exceptions.py +22 -0
  157. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  158. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  159. deltacat/tests/utils/test_daft.py +121 -31
  160. deltacat/tests/utils/test_numpy.py +1193 -0
  161. deltacat/tests/utils/test_pandas.py +1106 -0
  162. deltacat/tests/utils/test_polars.py +1040 -0
  163. deltacat/tests/utils/test_pyarrow.py +1370 -89
  164. deltacat/types/media.py +221 -11
  165. deltacat/types/tables.py +2329 -59
  166. deltacat/utils/arguments.py +33 -1
  167. deltacat/utils/daft.py +411 -150
  168. deltacat/utils/filesystem.py +100 -0
  169. deltacat/utils/metafile_locator.py +2 -1
  170. deltacat/utils/numpy.py +118 -26
  171. deltacat/utils/pandas.py +577 -48
  172. deltacat/utils/polars.py +658 -27
  173. deltacat/utils/pyarrow.py +1258 -213
  174. deltacat/utils/ray_utils/dataset.py +101 -10
  175. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  176. deltacat/utils/url.py +56 -15
  177. deltacat-2.0.0b12.dist-info/METADATA +1163 -0
  178. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/RECORD +183 -145
  179. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/WHEEL +1 -1
  180. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  181. deltacat/compute/merge_on_read/__init__.py +0 -4
  182. deltacat/compute/merge_on_read/daft.py +0 -40
  183. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  184. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  185. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  186. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
  187. deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
  188. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  189. deltacat/utils/s3fs.py +0 -21
  190. deltacat-2.0.0b11.dist-info/METADATA +0 -67
  191. /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
  192. /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
  193. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info/licenses}/LICENSE +0 -0
  194. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,205 @@
1
+ import time
2
+ import os
3
+ import posixpath
4
+ import pyarrow.fs
5
+ from pyarrow.fs import FileSelector, FileType
6
+ from itertools import chain
7
+ from deltacat.storage.model.transaction import Transaction
8
+ from deltacat.utils.filesystem import resolve_path_and_filesystem
9
+ from deltacat.constants import (
10
+ TXN_DIR_NAME,
11
+ RUNNING_TXN_DIR_NAME,
12
+ FAILED_TXN_DIR_NAME,
13
+ TXN_PART_SEPARATOR,
14
+ )
15
+ from deltacat.storage.model.types import TransactionState
16
+ import logging
17
+ from deltacat import logs
18
+
19
+ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
20
+
21
+
22
def brute_force_search_matching_metafiles(
    dirty_files_names, filesystem: pyarrow.fs.FileSystem, catalog_root
):
    """Recursively delete metafiles written by the given dirty transactions.

    Walks the catalog tree rooted at ``catalog_root`` (never descending into
    the transaction log directory itself) and deletes every file whose name
    contains the transaction id of one of ``dirty_files_names``.

    Args:
        dirty_files_names: Failed-transaction log file names. Each name is
            expected to carry the transaction id as its second
            ``TXN_PART_SEPARATOR``-delimited part; names without one are
            skipped.
        filesystem: Filesystem used for directory listing and file deletion.
        catalog_root: Root directory of the catalog to search.
    """
    txn_dir_name = TXN_DIR_NAME
    # Collect the transaction ids of the dirty log files.
    transaction_ids = []
    for dirty_file in dirty_files_names:
        parts = dirty_file.split(TXN_PART_SEPARATOR)
        if len(parts) < 2:
            continue
        transaction_ids.append(parts[1])

    if not transaction_ids:
        # Nothing to clean up — avoid walking the entire catalog tree.
        return

    def recursive_search(path):
        try:
            selector = FileSelector(path, recursive=False)
            entries = filesystem.get_file_info(selector)
        except Exception as e:
            logger.error(f"Error listing directory '{path}': {e}")
            return

        for entry in entries:
            base_name = posixpath.basename(entry.path)
            if entry.type == FileType.File:
                # Delete at most once per file, even if several transaction
                # ids happen to match its name (repeat deletes of the same
                # path would just log spurious errors).
                if any(txn_id in base_name for txn_id in transaction_ids):
                    try:
                        filesystem.delete_file(entry.path)
                        logger.debug(f"Deleted file: {entry.path}")
                    except Exception as e:
                        logger.error(f"Error deleting file '{entry.path}': {e}")
            elif entry.type == FileType.Directory:
                # Skip the transaction log directory so we never delete the
                # txn logs we are working from.
                if base_name == txn_dir_name:
                    logger.debug(f"Skipping directory: {entry.path}")
                    continue
                recursive_search(entry.path)

    # Start the recursive search from the catalog root.
    recursive_search(catalog_root)

    # Rename each dirty log to mark successful cleanup.
    failed_txn_log_dir = posixpath.join(catalog_root, TXN_DIR_NAME, FAILED_TXN_DIR_NAME)
    for dirty_file in dirty_files_names:
        old_log_path = posixpath.join(failed_txn_log_dir, dirty_file)
        # TODO(review): the intended rename (replacing the TIMEOUT_TXN token
        # with SUCCESSFULLY_CLEANED) is commented out upstream, which left
        # source and destination identical — a guaranteed-failing no-op move.
        # Skip the move when the paths are equal until the renaming scheme is
        # finalized.
        new_log_path = posixpath.join(failed_txn_log_dir, dirty_file)
        if new_log_path == old_log_path:
            continue
        try:
            filesystem.move(old_log_path, new_log_path)
            logger.debug(f"Renamed file from {old_log_path} to {new_log_path}")
        except Exception as e:
            logger.error(f"Error renaming file '{old_log_path}': {e}")
80
def janitor_delete_timed_out_transaction(catalog_root: str) -> None:
    """
    Traverse the running transactions directory and move transactions that
    have outlived their recorded end time into the failed transactions
    directory, then delete any metafiles those transactions wrote.
    """
    catalog_root_normalized, filesystem = resolve_path_and_filesystem(catalog_root)

    txn_log_dir = posixpath.join(catalog_root_normalized, TXN_DIR_NAME)
    running_txn_log_dir = posixpath.join(txn_log_dir, RUNNING_TXN_DIR_NAME)
    failed_txn_log_dir = posixpath.join(txn_log_dir, FAILED_TXN_DIR_NAME)

    # Names of logs moved to the failed dir during this pass.
    dirty_files = []

    running_txn_file_selector = FileSelector(running_txn_log_dir, recursive=False)
    running_txn_info_list = filesystem.get_file_info(running_txn_file_selector)

    for running_txn_info in running_txn_info_list:
        try:
            filename = posixpath.basename(running_txn_info.path)
            parts = filename.split(TXN_PART_SEPARATOR)
            # Last separator-delimited part of a running-txn log name records
            # its expiration time. NOTE(review): it is compared against
            # time.time_ns(), so the writer presumably records nanoseconds —
            # confirm against the code that creates these logs.
            end_time_str = parts[-1]
            end_time = float(end_time_str)
            current_time = time.time_ns()
            if end_time <= current_time:
                src_path = running_txn_info.path
                # Keep the original file name when moving the log to the
                # failed directory so its transaction id remains parseable by
                # brute_force_search_matching_metafiles. (Previously a broken
                # literal placeholder was used here, collapsing every
                # timed-out transaction onto one file name.)
                new_filename = filename
                dest_path = posixpath.join(failed_txn_log_dir, new_filename)

                # "Move" via copy-then-delete: pyarrow filesystems do not
                # guarantee atomic cross-directory renames on all backends.
                with filesystem.open_input_file(src_path) as src_file:
                    contents = src_file.read()
                with filesystem.open_output_stream(dest_path) as dest_file:
                    dest_file.write(contents)
                filesystem.delete_file(src_path)

                dirty_files.append(new_filename)

        except Exception as e:
            logger.error(
                f"Error cleaning failed transaction '{running_txn_info.path}': {e}"
            )

    # Pass catalog_root to the brute force search so it searches from the right place
    brute_force_search_matching_metafiles(
        dirty_files, filesystem, catalog_root_normalized
    )
129
def janitor_remove_files_in_failed(
    catalog_root: str, filesystem: pyarrow.fs.FileSystem = None
) -> None:
    """
    Cleans up metafiles and locator files associated with failed transactions.

    For every transaction log in the failed-transactions directory whose state
    is FAILED (and not already PURGED), deletes all metafile and locator write
    paths recorded in the log, removes any leftover running-transaction log,
    and renames the failed log to the bare transaction id.

    Args:
        catalog_root: Root directory of the catalog.
        filesystem: Optional filesystem to use; resolved from
            ``catalog_root`` when not provided.
    """
    if filesystem is None:
        catalog_root_normalized, filesystem = resolve_path_and_filesystem(catalog_root)
    else:
        catalog_root_normalized, filesystem = resolve_path_and_filesystem(
            catalog_root, filesystem
        )

    txn_log_dir = posixpath.join(catalog_root_normalized, TXN_DIR_NAME)
    failed_txn_log_dir = posixpath.join(txn_log_dir, FAILED_TXN_DIR_NAME)
    running_txn_log_dir = posixpath.join(txn_log_dir, RUNNING_TXN_DIR_NAME)
    filesystem.create_dir(failed_txn_log_dir, recursive=True)

    failed_txn_file_selector = FileSelector(failed_txn_log_dir, recursive=False)
    failed_txn_info_list = filesystem.get_file_info(failed_txn_file_selector)

    for failed_txn_info in failed_txn_info_list:
        try:
            txn = Transaction.read(failed_txn_info.path, filesystem)
            failed_txn_basename = posixpath.basename(failed_txn_info.path)

            # Compute the transaction state once (the original re-queried it
            # up to three times per iteration). If the state cannot be
            # determined, fall through without processing.
            txn_state = None
            try:
                txn_state = txn.state(catalog_root_normalized)
            except Exception:
                logger.error("Could not check attribute")

            # Skip transactions already fully purged; only clean FAILED ones.
            if txn_state is not None and txn_state != TransactionState.PURGED:
                if txn_state == TransactionState.FAILED:
                    txnid = txn.id

                    operations = txn["operations"]
                    known_write_paths = chain.from_iterable(
                        (op["metafile_write_paths"] + op["locator_write_paths"])
                        for op in operations
                    )

                    for write_path in known_write_paths:
                        full_path = posixpath.join(catalog_root_normalized, write_path)
                        try:
                            filesystem.delete_file(full_path)
                        except Exception as e:
                            logger.error(f"Failed to delete file '{full_path}': {e}")

                    new_filename = f"{txnid}"
                    new_failed_txn_log_file_path = posixpath.join(
                        failed_txn_log_dir, new_filename
                    )
                    running_txn_log_path = posixpath.join(
                        running_txn_log_dir, new_filename
                    )

                    # Best-effort removal of any leftover running log. The
                    # original called the non-existent os.delete (and
                    # os.rename below), which raises AttributeError and also
                    # bypasses the pyarrow filesystem abstraction, breaking
                    # non-local (e.g. S3) filesystems — use the filesystem
                    # API instead.
                    try:
                        filesystem.delete_file(running_txn_log_path)
                    except Exception:
                        # The running log may legitimately no longer exist.
                        pass

                    filesystem.move(failed_txn_info.path, new_failed_txn_log_file_path)
                    logger.debug(
                        f"Cleaned up failed transaction: {failed_txn_basename}"
                    )

        except Exception as e:
            logger.error(
                f"Could not read transaction '{failed_txn_info.path}', skipping: {e}"
            )
203
def janitor_job(catalog_root_dir: str) -> None:
    """Run one full janitor pass over the catalog.

    First moves timed-out running transactions into the failed-transaction
    log (deleting the metafiles they wrote), then purges files written by
    transactions already recorded as failed.
    """
    janitor_delete_timed_out_transaction(catalog_root_dir)
    janitor_remove_files_in_failed(catalog_root_dir)
@@ -21,11 +21,16 @@ def _run_cmd(cmd: str) -> None:
21
21
  assert exit_code == 0, f"`{cmd}` failed. Exit code: {exit_code}"
22
22
 
23
23
 
24
- def _ray_up(cluster_cfg: str, restart_only: bool = False) -> None:
24
+ def _ray_up(
25
+ cluster_cfg: str, cluster_name_override: str = None, restart_only: bool = False
26
+ ) -> None:
25
27
  restart_flag = "--no-restart" if not restart_only else "--restart-only"
28
+ cluster_name_option = (
29
+ f"-n '{cluster_name_override}'" if cluster_name_override else ""
30
+ )
26
31
  print(f"Starting Ray cluster from '{cluster_cfg}'")
27
32
  _run_cmd(
28
- f"ray up '{cluster_cfg}' -y --no-config-cache {restart_flag} --disable-usage-stats"
33
+ f"ray up '{cluster_cfg}' -y --no-config-cache {restart_flag} {cluster_name_option} --disable-usage-stats"
29
34
  )
30
35
  print(f"Started Ray cluster from '{cluster_cfg}'")
31
36
 
@@ -123,6 +128,7 @@ class DeltaCatJobClient(JobSubmissionClient):
123
128
  head_node_ip: str = None,
124
129
  dashboard_wait_time_seconds: int = 30,
125
130
  port: Union[int, str] = "8265",
131
+ cluster_name_override: str = None,
126
132
  ):
127
133
  job_submission_client_url = None
128
134
  try:
@@ -130,10 +136,12 @@ class DeltaCatJobClient(JobSubmissionClient):
130
136
  if cluster_cfg_file_path:
131
137
  if launch_cluster:
132
138
  if not _ray_cluster_running(cluster_cfg_file_path) or restart_ray:
133
- _ray_up(cluster_cfg_file_path)
139
+ _ray_up(cluster_cfg_file_path, cluster_name_override)
134
140
  elif restart_ray:
135
141
  if _ray_cluster_running(cluster_cfg_file_path):
136
- _ray_up(cluster_cfg_file_path, restart_ray)
142
+ _ray_up(
143
+ cluster_cfg_file_path, restart_ray, cluster_name_override
144
+ )
137
145
  else:
138
146
  raise RuntimeError(
139
147
  f"Cannot Restart Ray: Ray Cluster for "
@@ -336,10 +344,11 @@ def local_job_client(*args, **kwargs) -> DeltaCatJobClient:
336
344
  Raises:
337
345
  RuntimeError: If a local Ray Job Server cannot be found.
338
346
  """
339
- if not dc.is_initialized():
340
- context = dc.init(*args, **kwargs)
341
- else:
342
- context = dc.init()
347
+ # force reinitialization to ensure that we can get the Ray context
348
+ kwargs["force"] = True
349
+ context = dc.init(*args, **kwargs)
350
+ if context is None:
351
+ raise RuntimeError("Failed to retrieve Ray context.")
343
352
  if context.dashboard_url:
344
353
  head_node_ip, port = context.dashboard_url.split(":")
345
354
  else:
@@ -366,6 +375,7 @@ def job_client(
366
375
  head_node_ip: str = None,
367
376
  dashboard_wait_time_seconds: int = 15,
368
377
  port: Union[str, int] = "8265",
378
+ cluster_name_override: str = None,
369
379
  ) -> DeltaCatJobClient:
370
380
  """
371
381
  Create a DeltaCAT Job Client that can be used to submit jobs to a remote
@@ -403,4 +413,5 @@ def job_client(
403
413
  head_node_ip=head_node_ip,
404
414
  dashboard_wait_time_seconds=dashboard_wait_time_seconds,
405
415
  port=port,
416
+ cluster_name_override=cluster_name_override,
406
417
  )
@@ -1,5 +1,5 @@
1
1
  import logging
2
- from typing import Optional, Dict, Any
2
+ from typing import Optional, Dict, Any, List
3
3
  from deltacat import logs
4
4
  from deltacat.storage import (
5
5
  Delta,
@@ -61,6 +61,7 @@ def _estimate_resources_required_to_process_delta_using_previous_inflation(
61
61
  def _estimate_resources_required_to_process_delta_using_type_params(
62
62
  delta: Delta,
63
63
  operation_type: OperationType,
64
+ all_column_names: List[str],
64
65
  estimate_resources_params: EstimateResourcesParams,
65
66
  deltacat_storage: unimplemented_deltacat_storage,
66
67
  deltacat_storage_kwargs: Dict[str, Any],
@@ -93,11 +94,30 @@ def _estimate_resources_required_to_process_delta_using_type_params(
93
94
  on_disk_size_bytes=delta.meta.content_length,
94
95
  ),
95
96
  )
96
-
97
+ file_reader_kwargs_provider = kwargs.get(
98
+ "file_reader_kwargs_provider"
99
+ ) or deltacat_storage_kwargs.get("file_reader_kwargs_provider")
100
+
101
+ """
102
+ NOTE: The file_reader_kwargs_provider parameter can be passed in two ways:
103
+ 1. Nested within deltacat_storage_kwargs during resource estimation
104
+ 2. As a top-level attribute of CompactPartitionsParams during compaction
105
+
106
+ This creates an inconsistent parameter path between resource estimation and compaction flows.
107
+ As a long-term solution, this should be unified to use a single consistent path (either always
108
+ nested in deltacat_storage_kwargs or always as a top-level parameter).
109
+
110
+ For now, this implementation handles the resource estimation case by:
111
+ 1. First checking for file_reader_kwargs_provider as a direct kwarg
112
+ 2. Falling back to deltacat_storage_kwargs if not found
113
+ This approach maintains backward compatibility by not modifying the DELTA_RESOURCE_ESTIMATION_FUNCTIONS signatures.
114
+ """
97
115
  appended = append_content_type_params(
98
116
  delta=delta,
117
+ all_column_names=all_column_names,
99
118
  deltacat_storage=deltacat_storage,
100
119
  deltacat_storage_kwargs=deltacat_storage_kwargs,
120
+ file_reader_kwargs_provider=file_reader_kwargs_provider,
101
121
  )
102
122
 
103
123
  if not appended:
@@ -152,6 +172,10 @@ def _estimate_resources_required_to_process_delta_using_file_sampling(
152
172
  operation_type == OperationType.PYARROW_DOWNLOAD
153
173
  ), "Number of rows can only be estimated for PYARROW_DOWNLOAD operation"
154
174
 
175
+ if not estimate_resources_params.max_files_to_sample:
176
+ # we cannot calculate if we cannot sample
177
+ return None
178
+
155
179
  if not delta.manifest:
156
180
  delta.manifest = deltacat_storage.get_delta_manifest(
157
181
  delta.locator,
@@ -168,10 +192,6 @@ def _estimate_resources_required_to_process_delta_using_file_sampling(
168
192
  ),
169
193
  )
170
194
 
171
- if not estimate_resources_params.max_files_to_sample:
172
- # we cannot calculate if we cannot sample
173
- return None
174
-
175
195
  sampled_in_memory_size = 0.0
176
196
  sampled_on_disk_size = 0.0
177
197
  sampled_num_rows = 0
@@ -234,6 +254,10 @@ RESOURCE_ESTIMATION_METHOD_TO_DELTA_RESOURCE_ESTIMATION_FUNCTIONS = {
234
254
  _estimate_resources_required_to_process_delta_using_file_sampling,
235
255
  _estimate_resources_required_to_process_delta_using_previous_inflation,
236
256
  ],
257
+ ResourceEstimationMethod.FILE_SAMPLING_WITH_PREVIOUS_INFLATION: [
258
+ _estimate_resources_required_to_process_delta_using_file_sampling,
259
+ _estimate_resources_required_to_process_delta_using_previous_inflation,
260
+ ],
237
261
  }
238
262
 
239
263
 
@@ -267,10 +291,18 @@ def estimate_resources_required_to_process_delta(
267
291
  estimate_resources_params.resource_estimation_method
268
292
  )
269
293
 
294
+ all_column_names = deltacat_storage.get_table_version_column_names(
295
+ delta.locator.namespace,
296
+ delta.locator.table_name,
297
+ delta.locator.table_version,
298
+ **deltacat_storage_kwargs,
299
+ )
300
+
270
301
  for func in functions:
271
302
  resources = func(
272
303
  delta=delta,
273
304
  operation_type=operation_type,
305
+ all_column_names=all_column_names,
274
306
  estimate_resources_params=estimate_resources_params,
275
307
  deltacat_storage=deltacat_storage,
276
308
  deltacat_storage_kwargs=deltacat_storage_kwargs,
@@ -23,6 +23,14 @@ class ResourceEstimationMethod(str, Enum):
23
23
  """
24
24
  DEFAULT_V2 = "DEFAULT_V2"
25
25
 
26
+ """
27
+ This approach combines file sampling estimation and inflation based methods
28
+ and runs them in the order specified below:
29
+ 1. FILE_SAMPLING
30
+ 2. PREVIOUS_INFLATION
31
+ """
32
+ FILE_SAMPLING_WITH_PREVIOUS_INFLATION = "FILE_SAMPLING_WITH_PREVIOUS_INFLATION"
33
+
26
34
  """
27
35
  This approach strictly uses previous inflation and average record size to arrive
28
36
  at a resource estimate. It requires users to pass in previous inflation and average
deltacat/constants.py CHANGED
@@ -1,7 +1,10 @@
1
1
  from __future__ import annotations
2
2
 
3
+ import botocore.exceptions
3
4
 
5
+ from daft.exceptions import DaftTransientError
4
6
  from deltacat.utils.common import env_string, env_bool
7
+ from deltacat.utils.common import env_integer
5
8
 
6
9
  # Environment variables
7
10
  DELTACAT_SYS_LOG_LEVEL = env_string("DELTACAT_SYS_LOG_LEVEL", "DEBUG")
@@ -91,7 +94,10 @@ REVISION_DIR_NAME: str = "rev"
91
94
  TXN_DIR_NAME: str = "txn"
92
95
  RUNNING_TXN_DIR_NAME: str = "running"
93
96
  FAILED_TXN_DIR_NAME: str = "failed"
97
+ PAUSED_TXN_DIR_NAME: str = "paused"
94
98
  SUCCESS_TXN_DIR_NAME: str = "success"
99
+ DATA_FILE_DIR_NAME: str = "data"
100
+ REV_DIR_NAME: str = "rev"
95
101
  TXN_PART_SEPARATOR = "_"
96
102
 
97
103
  # Storage interface defaults
@@ -103,3 +109,41 @@ DEFAULT_TABLE_VERSION = "1"
103
109
  DEFAULT_STREAM_ID = "stream"
104
110
  DEFAULT_PARTITION_ID = "partition"
105
111
  DEFAULT_PARTITION_VALUES = ["default"]
112
+
113
+ # Transaction Status constants
114
+ SUCCESSFULLY_CLEANED = "cleaned"
115
+ CURRENTLY_CLEANING = "cleaning"
116
+ TIMEOUT_TXN = "timedout"
117
+
118
+ # operation timeout constants
119
+ OPERATION_TIMEOUTS = {
120
+ "create": 5,
121
+ "update": 3,
122
+ "delete": 4,
123
+ "read_siblings": 2,
124
+ "read_children": 2,
125
+ "read_latest": 3,
126
+ "read_exists": 1,
127
+ }
128
+ # Upload/Download Retry Defaults
129
+ UPLOAD_DOWNLOAD_RETRY_STOP_AFTER_DELAY = env_integer(
130
+ "UPLOAD_DOWNLOAD_RETRY_STOP_AFTER_DELAY", 10 * 60
131
+ )
132
+ UPLOAD_SLICED_TABLE_RETRY_STOP_AFTER_DELAY = env_integer(
133
+ "UPLOAD_SLICED_TABLE_RETRY_STOP_AFTER_DELAY", 30 * 60
134
+ )
135
+ DOWNLOAD_MANIFEST_ENTRY_RETRY_STOP_AFTER_DELAY = env_integer(
136
+ "DOWNLOAD_MANIFEST_ENTRY_RETRY_STOP_AFTER_DELAY", 30 * 60
137
+ )
138
+ DEFAULT_FILE_READ_TIMEOUT_MS = env_integer(
139
+ "DEFAULT_FILE_READ_TIMEOUT_MS", 300_000
140
+ ) # 5 mins
141
+ RETRYABLE_TRANSIENT_ERRORS = (
142
+ OSError,
143
+ botocore.exceptions.ConnectionError,
144
+ botocore.exceptions.HTTPClientError,
145
+ botocore.exceptions.NoCredentialsError,
146
+ botocore.exceptions.ConnectTimeoutError,
147
+ botocore.exceptions.ReadTimeoutError,
148
+ DaftTransientError,
149
+ )
File without changes
File without changes