PyPI - arbor-ai - Versions diffs - 0.1.8__py3-none-any.whl → 0.1.10__py3-none-any.whl - Mend

arbor-ai 0.1.8__py3-none-any.whl → 0.1.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

arbor/server/services/comms/comms.py CHANGED Viewed

@@ -49,10 +49,15 @@ class ArborServerCommsHandler:
             yield status
     def close(self):
+        self.command_socket.setsockopt(zmq.LINGER, 0)
         self.command_socket.close()
+        self.status_socket.setsockopt(zmq.LINGER, 0)
         self.status_socket.close()
+        self.data_socket.setsockopt(zmq.LINGER, 0)
         self.data_socket.close()
+        self.broadcast_socket.setsockopt(zmq.LINGER, 0)
         self.broadcast_socket.close()
+        self.handshake_socket.setsockopt(zmq.LINGER, 0)
         self.handshake_socket.close()
         self.context.term()

arbor/server/services/grpo_manager.py CHANGED Viewed

@@ -2,6 +2,7 @@ import json
 import os
 import random
 import signal
+import socket
 import string
 import subprocess
 import sys
@@ -120,12 +121,17 @@ class GRPOManager:
         num_processes = self.settings.arbor_config.training.gpu_ids.count(",") + 1
+        # This is the port for the accelerate main process
+        main_process_port = get_free_port()
         params = [
             "python",
             "-m",
             "accelerate.commands.launch",
             "--num_processes",
             str(num_processes),
+            "--main_process_port",
+            str(main_process_port),
         ]
         if self.settings.arbor_config.training.accelerate_config:
             params.extend(
@@ -276,7 +282,9 @@ class GRPOManager:
                 inference_manager.kill()
             # Send termination command through REQ socket
-            self.server_comms_handler.send_broadcast({"message": "terminate"})
+            # self.server_comms_handler.send_broadcast({"message": "terminate"})
+            self.training_process.terminate()
+            print("Waiting for training process to finish")
             # Wait for training process to finish
             if self.training_process:
@@ -289,6 +297,21 @@ class GRPOManager:
             if self.server_comms_handler:
                 self.server_comms_handler.close()
+            # Force kill training process if still running
+            if self.training_process and self.training_process.poll() is None:
+                self.training_process.kill()
+                self.training_process.wait()
+            # Reinitialize incase we want to start a new training run
+            self.training_process = None
+            self.current_model = None
+            self.server_comms_handler = None
+            self.status_thread = None
+            self.model_saved_and_reload_requested = False
+            self.data_count = 0
+            self.last_inference_update = 0
             if self.train_kwargs and "output_dir" in self.train_kwargs:
                 print(
                     f"Training completed. Model saved to {self.train_kwargs['output_dir']}"
@@ -297,9 +320,12 @@ class GRPOManager:
                     print(
                         f"Warning: Output directory {self.train_kwargs['output_dir']} does not exist"
                     )
-                return self.train_kwargs["output_dir"]
+                output_dir = self.train_kwargs["output_dir"]
+                self.train_kwargs = None
+                return output_dir
             else:
                 print("Training terminated, no output directory specified")
+                self.train_kwargs = None
                 return None
     def _should_update_model(self):
@@ -308,3 +334,12 @@ class GRPOManager:
         #     >= self.train_kwargs["update_interval"]
         # )
         return self.model_saved_and_reload_requested
+def get_free_port() -> int:
+    """
+    Return a free TCP port on localhost.
+    """
+    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+        s.bind(("localhost", 0))
+        return s.getsockname()[1]

arbor/server/services/inference_manager.py CHANGED Viewed

@@ -61,7 +61,7 @@ class InferenceManager:
         my_env = os.environ.copy()
         my_env["CUDA_VISIBLE_DEVICES"] = self.settings.arbor_config.inference.gpu_ids
         n_gpus = self.settings.arbor_config.inference.gpu_ids.count(",") + 1
-        # command = f"vllm serve {model} --port {port} --gpu-memory-utilization 0.9 --tensor-parallel-size {n_gpus} --max_model_len 8192 --enable_prefix_caching --guided-decoding-backend xgrammar"
+        # command = f"vllm serve {model} --port {port} --gpu-memory-utilization 0.9 --tensor-parallel-size {n_gpus} --max_model_len 8192 --enable_prefix_caching"
         command = f"python -m sglang_router.launch_server --model-path {model} --dp-size {n_gpus} --router-policy round_robin --port {port} --host 0.0.0.0"
         print(f"Running command: {command}")
@@ -137,8 +137,6 @@ class InferenceManager:
         process = self.process
         thread = self.thread
-        terminate_process(process)
         # Clear references first
         self.process = None
         self.thread = None

{arbor_ai-0.1.8.dist-info → arbor_ai-0.1.10.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: arbor-ai
-Version: 0.1.8
+Version: 0.1.10
 Summary: A framework for fine-tuning and managing language models
 Author-email: Noah Ziems <nziems2@nd.edu>
 Project-URL: Homepage, https://github.com/Ziems/arbor
@@ -57,6 +57,7 @@ inference:
 training:
   gpu_ids: '1, 2'
 ```
+Which will use the `GPU:0` for inference with `GPU:1` and `GPU:2` reserved for training. We generally recommend splitting the GPUs roughly evenly between inference and training.
 ### 2️⃣ Start the Server

{arbor_ai-0.1.8.dist-info → arbor_ai-0.1.10.dist-info}/RECORD RENAMED Viewed

@@ -17,18 +17,18 @@ arbor/server/core/logging.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,
 arbor/server/services/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 arbor/server/services/dependencies.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 arbor/server/services/file_manager.py,sha256=Z9z4A4EzvPauid_DBfpim401DDtuJy_TbX4twTWDJWI,12119
-arbor/server/services/grpo_manager.py,sha256=T-f1TrNSi_kmxPOcpaDphS8Xf3UMUbricocc6fuaKIM,12077
-arbor/server/services/inference_manager.py,sha256=qR9xPiYs4Is24vgeF72w7Hbe8j_PGEbl-qewcvUV-dA,9731
+arbor/server/services/grpo_manager.py,sha256=50g90lV8qpol7fQp2SBTXUCrF5eOP8YdxDnMLM0XY0E,13311
+arbor/server/services/inference_manager.py,sha256=gHI-Biy3TtGkyWxIDKY-uqZZm_fiQJLktkPY8ezRvo8,9660
 arbor/server/services/job_manager.py,sha256=m_d4UPwN_82f7t7K443DaFpFoyv7JZSZKml8tawt1Bk,2186
 arbor/server/services/training_manager.py,sha256=oQdhpfxdgp_lCTb_lxhvjupdLrcg6HL3TEbct_q9F6I,21065
 arbor/server/services/comms/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-arbor/server/services/comms/comms.py,sha256=Dg08D2Fm5TAEiGyr0Qcr0uocabQpFD_sBVhxIkj9D2M,7424
+arbor/server/services/comms/comms.py,sha256=3KN3mzwPvfW2_L5hq02JdAk6yOMyhY0_pBz-DDr5A3o,7694
 arbor/server/services/scripts/grpo_training.py,sha256=V36pCMZDJj2DdzquxScOddi9zP8EVPGWN3HGiftFfrY,21082
 arbor/server/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 arbor/server/utils/helpers.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-arbor_ai-0.1.8.dist-info/licenses/LICENSE,sha256=5vFGrbOFeXXM83JV9o16w7ohH4WLeu3-57GocJSz8ow,1067
-arbor_ai-0.1.8.dist-info/METADATA,sha256=kAZBj176hfqSrvrcWb0Wz8_vU33yiZJ-ck9buyDF6Jg,2234
-arbor_ai-0.1.8.dist-info/WHEEL,sha256=0CuiUZ_p9E4cD6NyLD6UG80LBXYyiSYZOKDm5lp32xk,91
-arbor_ai-0.1.8.dist-info/entry_points.txt,sha256=PGBX-MfNwfIl8UPFgsX3gjtXLqSogRhOktKMpZUysD0,40
-arbor_ai-0.1.8.dist-info/top_level.txt,sha256=jzWdp3BRYqvZDMFsPajrcftvvlluzVDErkD8IMRfhYs,6
-arbor_ai-0.1.8.dist-info/RECORD,,
+arbor_ai-0.1.10.dist-info/licenses/LICENSE,sha256=5vFGrbOFeXXM83JV9o16w7ohH4WLeu3-57GocJSz8ow,1067
+arbor_ai-0.1.10.dist-info/METADATA,sha256=qnUBfdKczxenG5kPTcZgQVMnWimEUPExz7nONxBYpDQ,2413
+arbor_ai-0.1.10.dist-info/WHEEL,sha256=0CuiUZ_p9E4cD6NyLD6UG80LBXYyiSYZOKDm5lp32xk,91
+arbor_ai-0.1.10.dist-info/entry_points.txt,sha256=PGBX-MfNwfIl8UPFgsX3gjtXLqSogRhOktKMpZUysD0,40
+arbor_ai-0.1.10.dist-info/top_level.txt,sha256=jzWdp3BRYqvZDMFsPajrcftvvlluzVDErkD8IMRfhYs,6
+arbor_ai-0.1.10.dist-info/RECORD,,

{arbor_ai-0.1.8.dist-info → arbor_ai-0.1.10.dist-info}/WHEEL RENAMED Viewed

File without changes

{arbor_ai-0.1.8.dist-info → arbor_ai-0.1.10.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{arbor_ai-0.1.8.dist-info → arbor_ai-0.1.10.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{arbor_ai-0.1.8.dist-info → arbor_ai-0.1.10.dist-info}/top_level.txt RENAMED Viewed

File without changes

arbor-ai 0.1.8__py3-none-any.whl → 0.1.10__py3-none-any.whl

arbor-ai 0.1.8__py3-none-any.whl → 0.1.10__py3-none-any.whl