arbor-ai 0.1.8__py3-none-any.whl → 0.1.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- arbor/server/services/comms/comms.py +5 -0
- arbor/server/services/grpo_manager.py +22 -2
- arbor/server/services/inference_manager.py +1 -3
- {arbor_ai-0.1.8.dist-info → arbor_ai-0.1.9.dist-info}/METADATA +1 -1
- {arbor_ai-0.1.8.dist-info → arbor_ai-0.1.9.dist-info}/RECORD +9 -9
- {arbor_ai-0.1.8.dist-info → arbor_ai-0.1.9.dist-info}/WHEEL +0 -0
- {arbor_ai-0.1.8.dist-info → arbor_ai-0.1.9.dist-info}/entry_points.txt +0 -0
- {arbor_ai-0.1.8.dist-info → arbor_ai-0.1.9.dist-info}/licenses/LICENSE +0 -0
- {arbor_ai-0.1.8.dist-info → arbor_ai-0.1.9.dist-info}/top_level.txt +0 -0
arbor/server/services/comms/comms.py

@@ -49,10 +49,15 @@ class ArborServerCommsHandler:
             yield status

     def close(self):
+        self.command_socket.setsockopt(zmq.LINGER, 0)
         self.command_socket.close()
+        self.status_socket.setsockopt(zmq.LINGER, 0)
         self.status_socket.close()
+        self.data_socket.setsockopt(zmq.LINGER, 0)
         self.data_socket.close()
+        self.broadcast_socket.setsockopt(zmq.LINGER, 0)
         self.broadcast_socket.close()
+        self.handshake_socket.setsockopt(zmq.LINGER, 0)
         self.handshake_socket.close()
         self.context.term()

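The new `setsockopt(zmq.LINGER, 0)` calls tell pyzmq to discard any unsent messages when a socket is closed; without them, `context.term()` can block indefinitely if the peer (here, the training subprocess) has already exited. A minimal sketch of the pattern, independent of Arbor's classes:

```python
import zmq

# Sketch of the LINGER=0 shutdown pattern (not Arbor's actual handler).
context = zmq.Context()
socket = context.socket(zmq.PUB)            # e.g. a broadcast-style socket
socket.bind("tcp://127.0.0.1:5555")

socket.send_json({"message": "terminate"})  # may never be delivered if no peer is listening

socket.setsockopt(zmq.LINGER, 0)  # drop queued outbound messages instead of waiting
socket.close()
context.term()                    # returns promptly because nothing lingers
```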
arbor/server/services/grpo_manager.py

@@ -276,7 +276,9 @@ class GRPOManager:
         inference_manager.kill()

         # Send termination command through REQ socket
-        self.server_comms_handler.send_broadcast({"message": "terminate"})
+        # self.server_comms_handler.send_broadcast({"message": "terminate"})
+        self.training_process.terminate()
+        print("Waiting for training process to finish")

         # Wait for training process to finish
         if self.training_process:

@@ -289,6 +291,21 @@ class GRPOManager:
         if self.server_comms_handler:
             self.server_comms_handler.close()

+        # Force kill training process if still running
+        if self.training_process and self.training_process.poll() is None:
+            self.training_process.kill()
+            self.training_process.wait()
+
+        # Reinitialize incase we want to start a new training run
+        self.training_process = None
+        self.current_model = None
+        self.server_comms_handler = None
+        self.status_thread = None
+        self.model_saved_and_reload_requested = False
+
+        self.data_count = 0
+        self.last_inference_update = 0
+
         if self.train_kwargs and "output_dir" in self.train_kwargs:
             print(
                 f"Training completed. Model saved to {self.train_kwargs['output_dir']}"

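The terminate path now follows the usual two-phase shutdown for a `subprocess.Popen` child: request exit with `terminate()` (SIGTERM), then fall back to `kill()` (SIGKILL) if `poll()` still reports the process as running, and `wait()` to reap it. A standalone sketch of that pattern; `stop_training` and the timeout value are illustrative, not Arbor's actual API:

```python
import subprocess

def stop_training(training_process: subprocess.Popen, timeout: float = 30.0) -> None:
    """Two-phase shutdown sketch: SIGTERM first, SIGKILL as a fallback."""
    if training_process is None:
        return

    training_process.terminate()             # polite request (SIGTERM)
    try:
        training_process.wait(timeout=timeout)
    except subprocess.TimeoutExpired:
        pass

    # Force kill if still running, then reap to avoid a zombie process.
    if training_process.poll() is None:
        training_process.kill()               # SIGKILL
        training_process.wait()
```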
@@ -297,9 +314,12 @@ class GRPOManager:
                 print(
                     f"Warning: Output directory {self.train_kwargs['output_dir']} does not exist"
                 )
-
+            output_dir = self.train_kwargs["output_dir"]
+            self.train_kwargs = None
+            return output_dir
         else:
             print("Training terminated, no output directory specified")
+            self.train_kwargs = None
             return None

     def _should_update_model(self):

arbor/server/services/inference_manager.py

@@ -61,7 +61,7 @@ class InferenceManager:
         my_env = os.environ.copy()
         my_env["CUDA_VISIBLE_DEVICES"] = self.settings.arbor_config.inference.gpu_ids
         n_gpus = self.settings.arbor_config.inference.gpu_ids.count(",") + 1
-        # command = f"vllm serve {model} --port {port} --gpu-memory-utilization 0.9 --tensor-parallel-size {n_gpus} --max_model_len 8192 --enable_prefix_caching
+        # command = f"vllm serve {model} --port {port} --gpu-memory-utilization 0.9 --tensor-parallel-size {n_gpus} --max_model_len 8192 --enable_prefix_caching"
         command = f"python -m sglang_router.launch_server --model-path {model} --dp-size {n_gpus} --router-policy round_robin --port {port} --host 0.0.0.0"
         print(f"Running command: {command}")

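0.1.9 keeps the earlier switch from `vllm serve` to the SGLang router, which here runs one data-parallel worker per visible GPU (`--dp-size`) behind a round-robin router; the change above only closes a dangling quote in the commented-out vLLM command. A hedged sketch of how such a launch looks in isolation; the model name, port, and GPU list are placeholders, not values from Arbor's config:

```python
import os
import subprocess

# Illustrative values only; Arbor reads these from its own config object.
model = "Qwen/Qwen2.5-1.5B-Instruct"
port = 30000
gpu_ids = "0,1"

my_env = os.environ.copy()
my_env["CUDA_VISIBLE_DEVICES"] = gpu_ids
n_gpus = gpu_ids.count(",") + 1

command = (
    f"python -m sglang_router.launch_server --model-path {model} "
    f"--dp-size {n_gpus} --router-policy round_robin "
    f"--port {port} --host 0.0.0.0"
)
print(f"Running command: {command}")

# Split the command into an argv list rather than using shell=True.
process = subprocess.Popen(command.split(), env=my_env)
```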
@@ -137,8 +137,6 @@ class InferenceManager:
         process = self.process
         thread = self.thread

-        terminate_process(process)
-
         # Clear references first
         self.process = None
         self.thread = None
{arbor_ai-0.1.8.dist-info → arbor_ai-0.1.9.dist-info}/RECORD

@@ -17,18 +17,18 @@ arbor/server/core/logging.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,
 arbor/server/services/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 arbor/server/services/dependencies.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 arbor/server/services/file_manager.py,sha256=Z9z4A4EzvPauid_DBfpim401DDtuJy_TbX4twTWDJWI,12119
-arbor/server/services/grpo_manager.py,sha256=
-arbor/server/services/inference_manager.py,sha256=
+arbor/server/services/grpo_manager.py,sha256=Nd7T0q1RTQUhhG4uMB8lUonw_6Rww31yVJo8MM-noU8,12903
+arbor/server/services/inference_manager.py,sha256=gHI-Biy3TtGkyWxIDKY-uqZZm_fiQJLktkPY8ezRvo8,9660
 arbor/server/services/job_manager.py,sha256=m_d4UPwN_82f7t7K443DaFpFoyv7JZSZKml8tawt1Bk,2186
 arbor/server/services/training_manager.py,sha256=oQdhpfxdgp_lCTb_lxhvjupdLrcg6HL3TEbct_q9F6I,21065
 arbor/server/services/comms/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-arbor/server/services/comms/comms.py,sha256=
+arbor/server/services/comms/comms.py,sha256=3KN3mzwPvfW2_L5hq02JdAk6yOMyhY0_pBz-DDr5A3o,7694
 arbor/server/services/scripts/grpo_training.py,sha256=V36pCMZDJj2DdzquxScOddi9zP8EVPGWN3HGiftFfrY,21082
 arbor/server/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 arbor/server/utils/helpers.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-arbor_ai-0.1.
-arbor_ai-0.1.
-arbor_ai-0.1.
-arbor_ai-0.1.
-arbor_ai-0.1.
-arbor_ai-0.1.
+arbor_ai-0.1.9.dist-info/licenses/LICENSE,sha256=5vFGrbOFeXXM83JV9o16w7ohH4WLeu3-57GocJSz8ow,1067
+arbor_ai-0.1.9.dist-info/METADATA,sha256=GVTgqz290CXbwW5Yyt-EpelGmp1UZuK8kpN67Fp2bV0,2234
+arbor_ai-0.1.9.dist-info/WHEEL,sha256=0CuiUZ_p9E4cD6NyLD6UG80LBXYyiSYZOKDm5lp32xk,91
+arbor_ai-0.1.9.dist-info/entry_points.txt,sha256=PGBX-MfNwfIl8UPFgsX3gjtXLqSogRhOktKMpZUysD0,40
+arbor_ai-0.1.9.dist-info/top_level.txt,sha256=jzWdp3BRYqvZDMFsPajrcftvvlluzVDErkD8IMRfhYs,6
+arbor_ai-0.1.9.dist-info/RECORD,,
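Each RECORD row has the form `path,sha256=<digest>,<size>`, where the digest is the urlsafe base64 encoding of the file's SHA-256 hash with trailing padding stripped, and RECORD's own entry carries empty hash and size fields. A small sketch of how one entry can be recomputed for verification (the path in the usage comment is just an example):

```python
import base64
import hashlib
from pathlib import Path

def record_entry(path: str) -> str:
    """Build a wheel RECORD line: path,sha256=<urlsafe-b64 digest>,<size>."""
    data = Path(path).read_bytes()
    digest = hashlib.sha256(data).digest()
    b64 = base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")
    return f"{path},sha256={b64},{len(data)}"

# Example usage against an installed file:
# print(record_entry("arbor/server/services/comms/comms.py"))
```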