nurion_raydp-1.7.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nurion_raydp-1.7.0.dist-info/METADATA +12 -0
- nurion_raydp-1.7.0.dist-info/RECORD +19 -0
- nurion_raydp-1.7.0.dist-info/WHEEL +5 -0
- nurion_raydp-1.7.0.dist-info/top_level.txt +1 -0
- raydp/__init__.py +26 -0
- raydp/_build_hooks.py +139 -0
- raydp/context.py +238 -0
- raydp/jars/__init__.py +0 -0
- raydp/jars/raydp-1.7.0-SNAPSHOT.jar +0 -0
- raydp/jars/raydp-shims-common-1.7.0-SNAPSHOT.jar +0 -0
- raydp/jars/raydp-shims-spark340-1.7.0-SNAPSHOT.jar +0 -0
- raydp/jars/raydp-shims-spark350-1.7.0-SNAPSHOT.jar +0 -0
- raydp/setup.py +44 -0
- raydp/spark/__init__.py +34 -0
- raydp/spark/dataset.py +232 -0
- raydp/spark/ray_cluster.py +162 -0
- raydp/spark/ray_cluster_master.py +102 -0
- raydp/spark/ray_pyworker.py +135 -0
- raydp/utils.py +359 -0
raydp/utils.py
ADDED
@@ -0,0 +1,359 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import os
import atexit
import logging
import math
import glob
import re
import signal
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple

import ray

logger = logging.getLogger(__name__)

MEMORY_SIZE_UNITS = {"K": 2**10, "M": 2**20, "G": 2**30, "T": 2**40}

# we use 4 bytes for block size, this means each block can contain
# 4294967296 records
BLOCK_SIZE_BIT = 32


def register_exit_handler(func):
    atexit.register(func)
    signal.signal(signal.SIGTERM, func)
    signal.signal(signal.SIGINT, func)
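
A brief usage sketch (illustrative only, not part of the packaged file): the same callable is registered with atexit (called with no arguments) and as the SIGTERM/SIGINT handler (called with signum and frame), so a `*args` signature satisfies both.

# Usage sketch (hypothetical caller):
def _cleanup(*args):
    ray.shutdown()

register_exit_handler(_cleanup)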


def random_split(df, weights, seed=None):
    """
    Random split the spark DataFrame or koalas DataFrame into given part
    :param df: the spark DataFrame or koalas DataFrame
    :param weights: list of doubles as weights with which to split the df.
        Weights will be normalized if they don't sum up to 1.0.
    :param seed: The seed for sampling.
    """
    # convert to Spark DataFrame
    df, is_spark_df = convert_to_spark(df)
    splits = df.randomSplit(weights, seed)
    if is_spark_df:
        return splits
    else:
        # convert back to pandas on Spark DataFrame
        import pyspark.pandas as ps  # pylint: disable=C0415

        return [ps.DataFrame(split) for split in splits]
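
A hedged usage sketch (not part of the packaged file), assuming an existing SparkSession named `spark`:

# Usage sketch: split a DataFrame into train/test parts.
df = spark.range(0, 1000)
train_df, test_df = random_split(df, [0.8, 0.2], seed=42)
# A Spark input yields Spark DataFrames; a pyspark.pandas input is
# converted back to pyspark.pandas.DataFrame objects.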


def _df_helper(df, spark_callback, spark_pandas_callback):
    try:
        import pyspark  # pylint: disable=C0415
    except Exception:
        pass
    else:
        if isinstance(df, pyspark.sql.DataFrame):
            return spark_callback(df)

    try:
        import pyspark.pandas as ps  # pylint: disable=C0415
    except Exception:
        pass
    else:
        if isinstance(df, ps.DataFrame):
            return spark_pandas_callback(df)

    raise Exception(
        f"The type: {type(df)} is not supported, only support "
        "pyspark.sql.DataFrame and pyspark.pandas.DataFrame"
    )


def df_type_check(df):
    """
    Check whether the df is spark DataFrame or koalas DataFrame.
    :return True for spark DataFrame or Koalas DataFrame.
    :raise Exception when it is neither spark DataFrame nor Koalas DataFrame.
    """
    return _df_helper(df, lambda d: True, lambda d: True)


def convert_to_spark(df):
    """
    Do nothing if the df is spark DataFrame, convert to spark DataFrame if it is
    koalas DataFrame. Raise Exception otherwise.
    :return: a pair of (converted df, whether it is spark DataFrame)
    """
    return _df_helper(df, lambda d: (d, True), lambda d: (d.to_spark(), False))
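
An illustrative sketch (not part of the packaged file) of how the two public helpers dispatch on DataFrame type, assuming an active SparkSession `spark`:

# Usage sketch:
import pyspark.pandas as ps

sdf = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "value"])
psdf = ps.DataFrame({"id": [1, 2], "value": ["a", "b"]})

df_type_check(sdf)                       # True
converted, is_spark = convert_to_spark(psdf)
# `converted` is a pyspark.sql.DataFrame and `is_spark` is False, so the
# caller knows to convert any result back to pyspark.pandas afterwards.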


def parse_memory_size(memory_size: str) -> int:
    """
    Parse the human readable memory size into bytes.
    Adapt from: https://stackoverflow.com/a/60708339
    :param memory_size: human readable memory size
    :return: convert to int size
    """
    memory_size = memory_size.strip().upper()
    if re.search(r"B", memory_size):
        # discard "B"
        memory_size = re.sub(r"B", "", memory_size)

    try:
        return int(memory_size)
    except ValueError:
        pass

    global MEMORY_SIZE_UNITS
    if not re.search(r" ", memory_size):
        memory_size = re.sub(r"([KMGT]+)", r" \1", memory_size)
    number, unit_index = [item.strip() for item in memory_size.split()]
    return int(float(number) * MEMORY_SIZE_UNITS[unit_index])
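
A few illustrative inputs (not part of the packaged file); the results follow directly from the MEMORY_SIZE_UNITS table above, which uses binary units:

# Illustrative values:
parse_memory_size("1024")    # 1024 (plain integers are taken as bytes)
parse_memory_size("512MB")   # 536870912  == 512 * 2**20
parse_memory_size("2 G")     # 2147483648 == 2 * 2**30
parse_memory_size("1.5GB")   # 1610612736 == int(1.5 * 2**30)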


def divide_blocks(
    blocks: List[int], world_size: int, shuffle: bool = False, shuffle_seed: int = None
) -> Dict[int, List[int]]:
    """
    Divide the blocks into world_size partitions and return the divided block indexes
    for each worker rank.
    :param blocks: the block sizes; each item is the size of the corresponding block
    :param world_size: total world size
    :param shuffle: whether to shuffle the blocks before dividing
    :param shuffle_seed: the shuffle seed
    :return: a dict keyed by world rank, whose value is a list of
        (block index, number of samples selected from that block) pairs
    """
    import numpy as np

    if len(blocks) < world_size:
        raise Exception("do not have enough blocks to divide")

    results = {}

    # number of blocks per rank
    num_blocks_per_rank = int(math.ceil(len(blocks) * 1.0 / world_size))
    # number of samples per rank
    num_samples_per_rank = int(math.ceil(sum(blocks) * 1.0 / world_size))
    # total number of blocks
    total_num_blocks = num_blocks_per_rank * world_size
    # global block indexes
    global_indexes = list(range(len(blocks)))

    # add extra blocks to make it evenly divisible
    if len(global_indexes) != total_num_blocks:
        global_indexes += global_indexes[: (total_num_blocks - len(global_indexes))]

    assert len(global_indexes) == total_num_blocks

    if shuffle_seed:
        np.random.seed(shuffle_seed)
    else:
        np.random.seed(0)

    if shuffle:
        np.random.shuffle(global_indexes)

    def select(index: int, current_size: int, selected: List[Tuple[int, int]]) -> int:
        block_size = blocks[index]
        tmp = current_size + block_size
        if tmp < num_samples_per_rank:
            selected.append((index, block_size))
            current_size = tmp
        elif tmp >= num_samples_per_rank:
            selected.append((index, (num_samples_per_rank - current_size)))
            current_size = num_samples_per_rank
        return current_size

    for rank in range(world_size):
        indexes = global_indexes[rank:total_num_blocks:world_size]
        assert len(indexes) == num_blocks_per_rank

        samples_cur_rank = 0
        selected_indexes = []
        for i in indexes:
            samples_cur_rank = select(i, samples_cur_rank, selected_indexes)
            if samples_cur_rank == num_samples_per_rank:
                break

        while samples_cur_rank < num_samples_per_rank:
            index = np.random.choice(global_indexes, size=1)[0]
            samples_cur_rank = select(index, samples_cur_rank, selected_indexes)

        assert samples_cur_rank == num_samples_per_rank

        results[rank] = selected_indexes

    return results
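
A small worked example (illustrative, not part of the packaged file): with blocks = [4, 3, 5] and world_size = 2, each rank targets ceil(12 / 2) = 6 samples, and the last block picked by each rank is capped so its counts sum to exactly 6.

# Worked example:
#   blocks = [4, 3, 5], world_size = 2
#   num_blocks_per_rank  = ceil(3 / 2)  = 2
#   num_samples_per_rank = ceil(12 / 2) = 6
#   global_indexes padded to 4 entries: [0, 1, 2, 0]
# Without shuffling this yields:
#   {0: [(0, 4), (2, 2)], 1: [(1, 3), (0, 3)]}
assignment = divide_blocks([4, 3, 5], world_size=2)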


def code_search_path() -> List[str]:
    import pyspark

    raydp_cp = os.path.abspath(os.path.join(os.path.abspath(__file__), "../jars/"))
    spark_home = os.environ.get("SPARK_HOME", os.path.dirname(pyspark.__file__))
    spark_jars_dir = os.path.abspath(os.path.join(spark_home, "jars/"))

    return [raydp_cp, spark_jars_dir]


def code_search_jars() -> List[str]:
    paths = code_search_path()
    jars = []
    for path in paths:
        jars.extend(glob.glob(os.path.join(path, "*.jar")))
    return jars
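
A hedged sketch (not part of the packaged file) of how these paths could be handed to Ray so JVM workers can locate the bundled RayDP and Spark jars; it assumes Ray's JobConfig(code_search_path=...) option:

# Illustrative sketch (assumes ray.job_config.JobConfig supports
# code_search_path, which Ray uses for cross-language class loading):
import ray

job_config = ray.job_config.JobConfig(code_search_path=code_search_path())
ray.init(job_config=job_config)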


@dataclass
class ExecutorConfig:
    """Auto-inferred executor configuration based on Ray cluster resources."""

    num_executors: int
    executor_cores: int
    executor_memory_gb: int
    driver_memory_gb: int

    @property
    def executor_memory(self) -> str:
        """Return executor memory as a Spark-compatible string (e.g., '4g')."""
        return f"{self.executor_memory_gb}g"

    @property
    def driver_memory(self) -> str:
        """Return driver memory as a Spark-compatible string (e.g., '2g')."""
        return f"{self.driver_memory_gb}g"


def auto_infer_executor_config(
    cpu_overhead_per_executor: int = 1,
    memory_overhead_gb_per_executor: int = 2,
    min_executor_cores: int = 1,
    min_executor_memory_gb: int = 4,
    min_driver_memory_gb: int = 2,
) -> ExecutorConfig:
    """
    Automatically infer Spark executor configuration based on Ray cluster resources.

    This function analyzes the Ray cluster topology to determine optimal Spark executor
    settings. It distinguishes between head and worker nodes, using worker nodes for
    executors and the head node for the driver.

    Args:
        cpu_overhead_per_executor: Number of CPUs to reserve per executor for system overhead.
        memory_overhead_gb_per_executor: GB of memory to reserve per executor for overhead.
        min_executor_cores: Minimum number of cores per executor.
        min_executor_memory_gb: Minimum memory (GB) per executor.
        min_driver_memory_gb: Minimum memory (GB) for the driver.

    Returns:
        ExecutorConfig with inferred settings.

    Example:
        >>> config = auto_infer_executor_config()
        >>> spark = init_spark(
        ...     app_name="my_app",
        ...     num_executors=config.num_executors,
        ...     executor_cores=config.executor_cores,
        ...     executor_memory=config.executor_memory,
        ... )
    """
    if not ray.is_initialized():
        raise RuntimeError(
            "Ray must be initialized before calling auto_infer_executor_config. "
            "Call ray.init() first."
        )

    # Get per-node resources using ray.nodes() for precise calculation
    nodes = ray.nodes()

    head_cpus = 0
    head_memory_gb = 0
    worker_nodes: List[Dict[str, int]] = []

    for node in nodes:
        if not node.get("Alive", False):
            continue
        node_resources = node.get("Resources", {})
        node_cpus = int(node_resources.get("CPU", 0))
        node_memory_gb = int(node_resources.get("memory", 0) / (1024 * 1024 * 1024))

        # Head node has 'node:__internal_head__' resource
        if "node:__internal_head__" in node_resources:
            head_cpus = node_cpus
            head_memory_gb = node_memory_gb
            logger.debug(f"Head node: {node_cpus} CPUs, {node_memory_gb}GB memory")
        else:
            worker_nodes.append({"cpus": node_cpus, "memory_gb": node_memory_gb})
            logger.debug(f"Worker node: {node_cpus} CPUs, {node_memory_gb}GB memory")

    num_workers = len(worker_nodes)
    if num_workers == 0:
        # Fallback: treat all resources as single executor (local mode)
        total_resources = ray.cluster_resources()
        executor_cores = max(
            int(total_resources.get("CPU", 4)) - cpu_overhead_per_executor,
            min_executor_cores,
        )
        executor_memory_gb = max(
            int(total_resources.get("memory", 8 * 1024**3) / 1024**3)
            - memory_overhead_gb_per_executor,
            min_executor_memory_gb,
        )
        num_executors = 1
        driver_memory_gb = min_driver_memory_gb
        logger.info(
            f"Local mode detected: 1 executor with {executor_cores} cores, "
            f"{executor_memory_gb}GB memory"
        )
    else:
        # Use minimum worker resources to ensure all executors can be scheduled
        min_worker_cpus = min(w["cpus"] for w in worker_nodes)
        min_worker_memory_gb = min(w["memory_gb"] for w in worker_nodes)

        # Executor config: leave overhead for system processes per worker
        executor_cores = max(
            min_worker_cpus - cpu_overhead_per_executor,
            min_executor_cores,
        )
        executor_memory_gb = max(
            min_worker_memory_gb - memory_overhead_gb_per_executor,
            min_executor_memory_gb,
        )
        num_executors = num_workers

        # Driver on head: use half of head memory
        driver_memory_gb = max(head_memory_gb // 2, min_driver_memory_gb)

        logger.info(
            f"Cluster: {num_workers} workers, head={head_cpus}CPU/{head_memory_gb}GB"
        )

    logger.info(
        f"Auto-configured: {num_executors} executors, {executor_cores} cores each, "
        f"{executor_memory_gb}g memory, driver {driver_memory_gb}g"
    )

    return ExecutorConfig(
        num_executors=num_executors,
        executor_cores=executor_cores,
        executor_memory_gb=executor_memory_gb,
        driver_memory_gb=driver_memory_gb,
    )
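
An end-to-end sketch (not part of the packaged file), following the docstring's own example and assuming raydp.init_spark accepts the keyword arguments shown there:

# End-to-end usage sketch:
import ray
import raydp
from raydp.utils import auto_infer_executor_config

ray.init(address="auto")
config = auto_infer_executor_config()
spark = raydp.init_spark(
    app_name="my_app",
    num_executors=config.num_executors,
    executor_cores=config.executor_cores,
    executor_memory=config.executor_memory,
)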