nurion-raydp 1.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
raydp/utils.py ADDED
@@ -0,0 +1,359 @@
+ #
+ # Licensed to the Apache Software Foundation (ASF) under one or more
+ # contributor license agreements. See the NOTICE file distributed with
+ # this work for additional information regarding copyright ownership.
+ # The ASF licenses this file to You under the Apache License, Version 2.0
+ # (the "License"); you may not use this file except in compliance with
+ # the License. You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ #
+
+ import os
+ import atexit
+ import logging
+ import math
+ import glob
+ import re
+ import signal
+ from dataclasses import dataclass
+ from typing import Dict, List, Optional, Tuple
+
+ import ray
+
+ logger = logging.getLogger(__name__)
+
+ MEMORY_SIZE_UNITS = {"K": 2**10, "M": 2**20, "G": 2**30, "T": 2**40}
+
+ # we use 32 bits (4 bytes) for the in-block record index, so each block
+ # can contain up to 4294967296 records
+ BLOCK_SIZE_BIT = 32
+
+
+ def register_exit_handler(func):
+     """Register func to run at normal interpreter exit and on SIGTERM/SIGINT."""
+     atexit.register(func)
+     signal.signal(signal.SIGTERM, func)
+     signal.signal(signal.SIGINT, func)
+
+
+ def random_split(df, weights, seed=None):
+     """
+     Randomly split the Spark DataFrame or pandas-on-Spark DataFrame into the given parts.
+     :param df: the Spark DataFrame or pandas-on-Spark DataFrame
+     :param weights: list of doubles as weights with which to split the df.
+                     Weights will be normalized if they don't sum up to 1.0.
+     :param seed: the seed for sampling.
+     :return: a list of split DataFrames of the same type as the input
+     """
+     # convert to Spark DataFrame
+     df, is_spark_df = convert_to_spark(df)
+     splits = df.randomSplit(weights, seed)
+     if is_spark_df:
+         return splits
+     else:
+         # convert back to pandas-on-Spark DataFrames
+         import pyspark.pandas as ps  # pylint: disable=C0415
+
+         return [ps.DataFrame(split) for split in splits]
+
+
+ def _df_helper(df, spark_callback, spark_pandas_callback):
+     """Dispatch to the callback that matches the DataFrame type of df."""
+     try:
+         import pyspark  # pylint: disable=C0415
+     except Exception:
+         pass
+     else:
+         if isinstance(df, pyspark.sql.DataFrame):
+             return spark_callback(df)
+
+     try:
+         import pyspark.pandas as ps  # pylint: disable=C0415
+     except Exception:
+         pass
+     else:
+         if isinstance(df, ps.DataFrame):
+             return spark_pandas_callback(df)
+
+     raise Exception(
+         f"The type: {type(df)} is not supported, only support "
+         "pyspark.sql.DataFrame and pyspark.pandas.DataFrame"
+     )
+
+
+ def df_type_check(df):
+     """
+     Check whether df is a Spark DataFrame or a pandas-on-Spark DataFrame.
+     :return: True for a Spark DataFrame or a pandas-on-Spark DataFrame
+     :raises: Exception when it is neither a Spark DataFrame nor a pandas-on-Spark DataFrame
+     """
+     return _df_helper(df, lambda d: True, lambda d: True)
+
+
+ def convert_to_spark(df):
+     """
+     Do nothing if df is a Spark DataFrame; convert it to a Spark DataFrame if it is
+     a pandas-on-Spark DataFrame. Raise an Exception otherwise.
+     :return: a pair of (converted df, whether the input was a Spark DataFrame)
+     """
+     return _df_helper(df, lambda d: (d, True), lambda d: (d.to_spark(), False))
+
+
+ def parse_memory_size(memory_size: str) -> int:
+     """
+     Parse a human readable memory size into bytes.
+     Adapted from: https://stackoverflow.com/a/60708339
+     :param memory_size: human readable memory size, e.g. "10GB", "10 G" or "1024"
+     :return: the size in bytes as an int
+     """
+     memory_size = memory_size.strip().upper()
+     if re.search(r"B", memory_size):
+         # discard "B"
+         memory_size = re.sub(r"B", "", memory_size)
+
+     try:
+         return int(memory_size)
+     except ValueError:
+         pass
+
+     if not re.search(r" ", memory_size):
+         memory_size = re.sub(r"([KMGT]+)", r" \1", memory_size)
+     number, unit_index = [item.strip() for item in memory_size.split()]
+     return int(float(number) * MEMORY_SIZE_UNITS[unit_index])
+
+
+ def divide_blocks(
+     blocks: List[int], world_size: int, shuffle: bool = False, shuffle_seed: Optional[int] = None
+ ) -> Dict[int, List[Tuple[int, int]]]:
+     """
+     Divide the blocks into world_size partitions and return the selected block indexes
+     for every rank.
+     :param blocks: the list of block sizes
+     :param world_size: total world size
+     :param shuffle: whether to shuffle the blocks before dividing
+     :param shuffle_seed: the shuffle seed
+     :return: a dict whose key is the world rank and whose value is a list of
+              (block index, number of samples selected from that block) pairs
+     """
+     import numpy as np
+
+     if len(blocks) < world_size:
+         raise Exception("do not have enough blocks to divide")
+
+     results = {}
+
+     # number of blocks per rank
+     num_blocks_per_rank = int(math.ceil(len(blocks) * 1.0 / world_size))
+     # number of samples per rank
+     num_samples_per_rank = int(math.ceil(sum(blocks) * 1.0 / world_size))
+     # total number of blocks
+     total_num_blocks = num_blocks_per_rank * world_size
+     # global block indexes
+     global_indexes = list(range(len(blocks)))
+
+     # add extra blocks to make it evenly divisible
+     if len(global_indexes) != total_num_blocks:
+         global_indexes += global_indexes[: (total_num_blocks - len(global_indexes))]
+
+     assert len(global_indexes) == total_num_blocks
+
+     if shuffle_seed is not None:
+         np.random.seed(shuffle_seed)
+     else:
+         np.random.seed(0)
+
+     if shuffle:
+         np.random.shuffle(global_indexes)
+
+     def select(index: int, current_size: int, selected: List[Tuple[int, int]]) -> int:
+         block_size = blocks[index]
+         tmp = current_size + block_size
+         if tmp < num_samples_per_rank:
+             selected.append((index, block_size))
+             current_size = tmp
+         else:
+             selected.append((index, (num_samples_per_rank - current_size)))
+             current_size = num_samples_per_rank
+         return current_size
+
+     for rank in range(world_size):
+         indexes = global_indexes[rank:total_num_blocks:world_size]
+         assert len(indexes) == num_blocks_per_rank
+
+         samples_cur_rank = 0
+         selected_indexes = []
+         for i in indexes:
+             samples_cur_rank = select(i, samples_cur_rank, selected_indexes)
+             if samples_cur_rank == num_samples_per_rank:
+                 break
+
+         while samples_cur_rank < num_samples_per_rank:
+             index = np.random.choice(global_indexes, size=1)[0]
+             samples_cur_rank = select(index, samples_cur_rank, selected_indexes)
+
+         assert samples_cur_rank == num_samples_per_rank
+
+         results[rank] = selected_indexes
+
+     return results
+
+
+ def code_search_path() -> List[str]:
+     """Return the directories that contain the RayDP and Spark jars."""
+     import pyspark
+
+     raydp_cp = os.path.abspath(os.path.join(os.path.abspath(__file__), "../jars/"))
+     spark_home = os.environ.get("SPARK_HOME", os.path.dirname(pyspark.__file__))
+     spark_jars_dir = os.path.abspath(os.path.join(spark_home, "jars/"))
+
+     return [raydp_cp, spark_jars_dir]
+
+
+ def code_search_jars() -> List[str]:
+     paths = code_search_path()
+     jars = []
+     for path in paths:
+         jars.extend(glob.glob(os.path.join(path, "*.jar")))
+     return jars
+
+
+ @dataclass
+ class ExecutorConfig:
+     """Auto-inferred executor configuration based on Ray cluster resources."""
+
+     num_executors: int
+     executor_cores: int
+     executor_memory_gb: int
+     driver_memory_gb: int
+
+     @property
+     def executor_memory(self) -> str:
+         """Return executor memory as a Spark-compatible string (e.g., '4g')."""
+         return f"{self.executor_memory_gb}g"
+
+     @property
+     def driver_memory(self) -> str:
+         """Return driver memory as a Spark-compatible string (e.g., '2g')."""
+         return f"{self.driver_memory_gb}g"
+
+
+ def auto_infer_executor_config(
+     cpu_overhead_per_executor: int = 1,
+     memory_overhead_gb_per_executor: int = 2,
+     min_executor_cores: int = 1,
+     min_executor_memory_gb: int = 4,
+     min_driver_memory_gb: int = 2,
+ ) -> ExecutorConfig:
+     """
+     Automatically infer Spark executor configuration based on Ray cluster resources.
+
+     This function analyzes the Ray cluster topology to determine optimal Spark executor
+     settings. It distinguishes between head and worker nodes, using worker nodes for
+     executors and the head node for the driver.
+
+     Args:
+         cpu_overhead_per_executor: Number of CPUs to reserve per executor for system overhead.
+         memory_overhead_gb_per_executor: GB of memory to reserve per executor for overhead.
+         min_executor_cores: Minimum number of cores per executor.
+         min_executor_memory_gb: Minimum memory (GB) per executor.
+         min_driver_memory_gb: Minimum memory (GB) for the driver.
+
+     Returns:
+         ExecutorConfig with the inferred settings.
+
+     Example:
+         >>> config = auto_infer_executor_config()
+         >>> spark = init_spark(
+         ...     app_name="my_app",
+         ...     num_executors=config.num_executors,
+         ...     executor_cores=config.executor_cores,
+         ...     executor_memory=config.executor_memory,
+         ... )
+     """
+     if not ray.is_initialized():
+         raise RuntimeError(
+             "Ray must be initialized before calling auto_infer_executor_config. "
+             "Call ray.init() first."
+         )
+
+     # Get per-node resources using ray.nodes() for precise calculation
+     nodes = ray.nodes()
+
+     head_cpus = 0
+     head_memory_gb = 0
+     worker_nodes: List[Dict[str, int]] = []
+
+     for node in nodes:
+         if not node.get("Alive", False):
+             continue
+         node_resources = node.get("Resources", {})
+         node_cpus = int(node_resources.get("CPU", 0))
+         node_memory_gb = int(node_resources.get("memory", 0) / (1024 * 1024 * 1024))
+
+         # Head node has 'node:__internal_head__' resource
+         if "node:__internal_head__" in node_resources:
+             head_cpus = node_cpus
+             head_memory_gb = node_memory_gb
+             logger.debug(f"Head node: {node_cpus} CPUs, {node_memory_gb}GB memory")
+         else:
+             worker_nodes.append({"cpus": node_cpus, "memory_gb": node_memory_gb})
+             logger.debug(f"Worker node: {node_cpus} CPUs, {node_memory_gb}GB memory")
+
+     num_workers = len(worker_nodes)
+     if num_workers == 0:
+         # Fallback: treat all resources as single executor (local mode)
+         total_resources = ray.cluster_resources()
+         executor_cores = max(
+             int(total_resources.get("CPU", 4)) - cpu_overhead_per_executor,
+             min_executor_cores,
+         )
+         executor_memory_gb = max(
+             int(total_resources.get("memory", 8 * 1024**3) / 1024**3)
+             - memory_overhead_gb_per_executor,
+             min_executor_memory_gb,
+         )
+         num_executors = 1
+         driver_memory_gb = min_driver_memory_gb
+         logger.info(
+             f"Local mode detected: 1 executor with {executor_cores} cores, "
+             f"{executor_memory_gb}GB memory"
+         )
+     else:
+         # Use minimum worker resources to ensure all executors can be scheduled
+         min_worker_cpus = min(w["cpus"] for w in worker_nodes)
+         min_worker_memory_gb = min(w["memory_gb"] for w in worker_nodes)
+
+         # Executor config: leave overhead for system processes per worker
+         executor_cores = max(
+             min_worker_cpus - cpu_overhead_per_executor,
+             min_executor_cores,
+         )
+         executor_memory_gb = max(
+             min_worker_memory_gb - memory_overhead_gb_per_executor,
+             min_executor_memory_gb,
+         )
+         num_executors = num_workers
+
+         # Driver on head: use half of head memory
+         driver_memory_gb = max(head_memory_gb // 2, min_driver_memory_gb)
+
+         logger.info(
+             f"Cluster: {num_workers} workers, head={head_cpus}CPU/{head_memory_gb}GB"
+         )
+
+     logger.info(
+         f"Auto-configured: {num_executors} executors, {executor_cores} cores each, "
+         f"{executor_memory_gb}g memory, driver {driver_memory_gb}g"
+     )
+
+     return ExecutorConfig(
+         num_executors=num_executors,
+         executor_cores=executor_cores,
+         executor_memory_gb=executor_memory_gb,
+         driver_memory_gb=driver_memory_gb,
+     )
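
The following is a minimal, illustrative usage sketch of the utilities added above. It assumes Ray and this package are installed and that raydp.init_spark accepts the keyword arguments shown in the auto_infer_executor_config docstring example; adjust names to your environment.

import ray
import raydp
from raydp.utils import auto_infer_executor_config, divide_blocks, parse_memory_size

ray.init(address="auto")

# Infer executor settings from the Ray cluster topology, then start Spark on Ray.
config = auto_infer_executor_config()
spark = raydp.init_spark(
    app_name="example",
    num_executors=config.num_executors,
    executor_cores=config.executor_cores,
    executor_memory=config.executor_memory,
)

# parse_memory_size converts human readable sizes to bytes.
assert parse_memory_size("512MB") == 512 * 2**20

# divide_blocks maps each rank to a list of (block index, sample count) pairs.
assignments = divide_blocks(blocks=[100, 80, 120, 100], world_size=2)
print(assignments)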