arbor-ai 0.2__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -270,7 +270,6 @@ class GRPOManager:
270
270
  print("Updating inference model...")
271
271
  # There is a case where this status is sent multiple times
272
272
  # We need to make sure we only update the model once
273
- self.current_model = status["output_dir"]
274
273
  self.saving_model = False
275
274
  print("Model update complete")
276
275
  elif status["status"] == "checkpoint_saved":
@@ -308,6 +307,9 @@ class GRPOManager:
308
307
  print(f"Failed to send batch to training process: {e}")
309
308
  raise
310
309
 
310
+ self.current_model = self.train_kwargs["output_dir"]
311
+ inference_manager.launched_model = self.current_model
312
+
311
313
  return {
312
314
  "current_model": self.current_model,
313
315
  "checkpoints": self.checkpoints,
@@ -1,5 +1,6 @@
1
1
  # adapted from Will Brown's verifiers library (https://github.com/willccbb/verifiers)
2
2
 
3
+ import asyncio
3
4
  import atexit
4
5
  import logging
5
6
  import time
@@ -239,7 +240,7 @@ class VLLMClient:
239
240
  response.raise_for_status()
240
241
  return response.json()
241
242
 
242
- except httpx.TimeoutError:
243
+ except httpx.TimeoutException:
243
244
  logger.error("Request timed out")
244
245
  raise
245
246
  except InferenceBlockedError:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: arbor-ai
3
- Version: 0.2
3
+ Version: 0.2.1
4
4
  Summary: A framework for fine-tuning and managing language models
5
5
  Author-email: Noah Ziems <nziems2@nd.edu>
6
6
  Project-URL: Homepage, https://github.com/Ziems/arbor
@@ -79,6 +79,16 @@ Follow the DSPy tutorials here to see usage examples:
79
79
 
80
80
  ---
81
81
 
82
+ ### Troubleshooting
83
+
84
+ **NCCL Errors**
85
+ Certain GPU setups, particularly with newer GPUs, seem to have issues with NCCL that cause Arbor to crash. Oftentimes these can be fixed by setting the following environment variables:
86
+
87
+ ```bash
88
+ export NCCL_P2P_DISABLE=1
89
+ export NCCL_IB_DISABLE=1
90
+ ```
91
+
82
92
  ## 🙏 Acknowledgements
83
93
 
84
94
  Arbor builds on the shoulders of great work. We extend our thanks to:
@@ -17,14 +17,14 @@ arbor/server/core/logging.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,
17
17
  arbor/server/services/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
18
18
  arbor/server/services/dependencies.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
19
19
  arbor/server/services/file_manager.py,sha256=Z9z4A4EzvPauid_DBfpim401DDtuJy_TbX4twTWDJWI,12119
20
- arbor/server/services/grpo_manager.py,sha256=MDpOGN99WnNg4q_8974MkAnqcClOXy6fYcD2sFvs2Ho,18487
20
+ arbor/server/services/grpo_manager.py,sha256=jY4kc7wlKKoi7RigjJiH1VaxX6qJCOxyEc0oYCkqPlQ,18549
21
21
  arbor/server/services/inference_manager.py,sha256=a1c5zYbjk6fPM3egX2McKv7ZWPN7c-QH_Qogu9iay90,9597
22
22
  arbor/server/services/job_manager.py,sha256=m_d4UPwN_82f7t7K443DaFpFoyv7JZSZKml8tawt1Bk,2186
23
23
  arbor/server/services/training_manager.py,sha256=oQdhpfxdgp_lCTb_lxhvjupdLrcg6HL3TEbct_q9F6I,21065
24
24
  arbor/server/services/comms/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
25
25
  arbor/server/services/comms/comms.py,sha256=3KN3mzwPvfW2_L5hq02JdAk6yOMyhY0_pBz-DDr5A3o,7694
26
26
  arbor/server/services/inference/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
27
- arbor/server/services/inference/vllm_client.py,sha256=P4etwX47VVMEaVWUOT-aP6_OONf8ZzniwXndmJujNxY,18250
27
+ arbor/server/services/inference/vllm_client.py,sha256=06-VfdcwKqq8_ZRWaER3OnSVLtvL87bLdljSrkXfm-A,18269
28
28
  arbor/server/services/inference/vllm_serve.py,sha256=UZAGo7CyshR3-9fhXCTKhXeidqNqbY6LyU9DDNiX_Sw,109543
29
29
  arbor/server/services/scripts/dpo_training.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
30
30
  arbor/server/services/scripts/grpo_training.py,sha256=6kXzMwn3rZXHdEn0xe_Kd9d7tbdYb76zE0zbi02xCm4,31314
@@ -34,9 +34,9 @@ arbor/server/services/scripts/utils/arg_parser.py,sha256=ur_iyhc_Ie00tjq63vK4Sde
34
34
  arbor/server/services/scripts/utils/dataset.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
35
35
  arbor/server/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
36
36
  arbor/server/utils/helpers.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
37
- arbor_ai-0.2.dist-info/licenses/LICENSE,sha256=5vFGrbOFeXXM83JV9o16w7ohH4WLeu3-57GocJSz8ow,1067
38
- arbor_ai-0.2.dist-info/METADATA,sha256=LieUwdo2RQBgh5ukQJh-NHUA2_CBS1Dr9YqjSbgcEnM,2504
39
- arbor_ai-0.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
40
- arbor_ai-0.2.dist-info/entry_points.txt,sha256=PGBX-MfNwfIl8UPFgsX3gjtXLqSogRhOktKMpZUysD0,40
41
- arbor_ai-0.2.dist-info/top_level.txt,sha256=jzWdp3BRYqvZDMFsPajrcftvvlluzVDErkD8IMRfhYs,6
42
- arbor_ai-0.2.dist-info/RECORD,,
37
+ arbor_ai-0.2.1.dist-info/licenses/LICENSE,sha256=5vFGrbOFeXXM83JV9o16w7ohH4WLeu3-57GocJSz8ow,1067
38
+ arbor_ai-0.2.1.dist-info/METADATA,sha256=34XAZBm8OLlsSBicLmRn_hhbltn0pDNlAj5WOjn9LtE,2791
39
+ arbor_ai-0.2.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
40
+ arbor_ai-0.2.1.dist-info/entry_points.txt,sha256=PGBX-MfNwfIl8UPFgsX3gjtXLqSogRhOktKMpZUysD0,40
41
+ arbor_ai-0.2.1.dist-info/top_level.txt,sha256=jzWdp3BRYqvZDMFsPajrcftvvlluzVDErkD8IMRfhYs,6
42
+ arbor_ai-0.2.1.dist-info/RECORD,,