arbor-ai 0.2__py3-none-any.whl → 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- arbor/server/services/grpo_manager.py +3 -1
- arbor/server/services/inference/vllm_client.py +2 -1
- {arbor_ai-0.2.dist-info → arbor_ai-0.2.1.dist-info}/METADATA +11 -1
- {arbor_ai-0.2.dist-info → arbor_ai-0.2.1.dist-info}/RECORD +8 -8
- {arbor_ai-0.2.dist-info → arbor_ai-0.2.1.dist-info}/WHEEL +0 -0
- {arbor_ai-0.2.dist-info → arbor_ai-0.2.1.dist-info}/entry_points.txt +0 -0
- {arbor_ai-0.2.dist-info → arbor_ai-0.2.1.dist-info}/licenses/LICENSE +0 -0
- {arbor_ai-0.2.dist-info → arbor_ai-0.2.1.dist-info}/top_level.txt +0 -0
@@ -270,7 +270,6 @@ class GRPOManager:
|
|
270
270
|
print("Updating inference model...")
|
271
271
|
# There is a case where this status is sent multiple times
|
272
272
|
# We need to make sure we only update the model once
|
273
|
-
self.current_model = status["output_dir"]
|
274
273
|
self.saving_model = False
|
275
274
|
print("Model update complete")
|
276
275
|
elif status["status"] == "checkpoint_saved":
|
@@ -308,6 +307,9 @@ class GRPOManager:
|
|
308
307
|
print(f"Failed to send batch to training process: {e}")
|
309
308
|
raise
|
310
309
|
|
310
|
+
self.current_model = self.train_kwargs["output_dir"]
|
311
|
+
inference_manager.launched_model = self.current_model
|
312
|
+
|
311
313
|
return {
|
312
314
|
"current_model": self.current_model,
|
313
315
|
"checkpoints": self.checkpoints,
|
@@ -1,5 +1,6 @@
|
|
1
1
|
# adapted from Will Brown's verifiers library (https://github.com/willccbb/verifiers)
|
2
2
|
|
3
|
+
import asyncio
|
3
4
|
import atexit
|
4
5
|
import logging
|
5
6
|
import time
|
@@ -239,7 +240,7 @@ class VLLMClient:
|
|
239
240
|
response.raise_for_status()
|
240
241
|
return response.json()
|
241
242
|
|
242
|
-
except httpx.
|
243
|
+
except httpx.TimeoutException:
|
243
244
|
logger.error("Request timed out")
|
244
245
|
raise
|
245
246
|
except InferenceBlockedError:
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: arbor-ai
|
3
|
-
Version: 0.2
|
3
|
+
Version: 0.2.1
|
4
4
|
Summary: A framework for fine-tuning and managing language models
|
5
5
|
Author-email: Noah Ziems <nziems2@nd.edu>
|
6
6
|
Project-URL: Homepage, https://github.com/Ziems/arbor
|
@@ -79,6 +79,16 @@ Follow the DSPy tutorials here to see usage examples:
|
|
79
79
|
|
80
80
|
---
|
81
81
|
|
82
|
+
### Troubleshooting
|
83
|
+
|
84
|
+
**NCCL Errors**
|
85
|
+
Certain GPU setups, particularly with newer GPUs, seem to have issues with NCCL that cause Arbor to crash. Oftentimes these can be fixed with the following environment variables:
|
86
|
+
|
87
|
+
```bash
|
88
|
+
export NCCL_P2P_DISABLE=1
|
89
|
+
export NCCL_IB_DISABLE=1
|
90
|
+
```
|
91
|
+
|
82
92
|
## 🙏 Acknowledgements
|
83
93
|
|
84
94
|
Arbor builds on the shoulders of great work. We extend our thanks to:
|
@@ -17,14 +17,14 @@ arbor/server/core/logging.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,
|
|
17
17
|
arbor/server/services/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
18
18
|
arbor/server/services/dependencies.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
19
19
|
arbor/server/services/file_manager.py,sha256=Z9z4A4EzvPauid_DBfpim401DDtuJy_TbX4twTWDJWI,12119
|
20
|
-
arbor/server/services/grpo_manager.py,sha256=
|
20
|
+
arbor/server/services/grpo_manager.py,sha256=jY4kc7wlKKoi7RigjJiH1VaxX6qJCOxyEc0oYCkqPlQ,18549
|
21
21
|
arbor/server/services/inference_manager.py,sha256=a1c5zYbjk6fPM3egX2McKv7ZWPN7c-QH_Qogu9iay90,9597
|
22
22
|
arbor/server/services/job_manager.py,sha256=m_d4UPwN_82f7t7K443DaFpFoyv7JZSZKml8tawt1Bk,2186
|
23
23
|
arbor/server/services/training_manager.py,sha256=oQdhpfxdgp_lCTb_lxhvjupdLrcg6HL3TEbct_q9F6I,21065
|
24
24
|
arbor/server/services/comms/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
25
25
|
arbor/server/services/comms/comms.py,sha256=3KN3mzwPvfW2_L5hq02JdAk6yOMyhY0_pBz-DDr5A3o,7694
|
26
26
|
arbor/server/services/inference/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
27
|
-
arbor/server/services/inference/vllm_client.py,sha256=
|
27
|
+
arbor/server/services/inference/vllm_client.py,sha256=06-VfdcwKqq8_ZRWaER3OnSVLtvL87bLdljSrkXfm-A,18269
|
28
28
|
arbor/server/services/inference/vllm_serve.py,sha256=UZAGo7CyshR3-9fhXCTKhXeidqNqbY6LyU9DDNiX_Sw,109543
|
29
29
|
arbor/server/services/scripts/dpo_training.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
30
30
|
arbor/server/services/scripts/grpo_training.py,sha256=6kXzMwn3rZXHdEn0xe_Kd9d7tbdYb76zE0zbi02xCm4,31314
|
@@ -34,9 +34,9 @@ arbor/server/services/scripts/utils/arg_parser.py,sha256=ur_iyhc_Ie00tjq63vK4Sde
|
|
34
34
|
arbor/server/services/scripts/utils/dataset.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
35
35
|
arbor/server/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
36
36
|
arbor/server/utils/helpers.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
37
|
-
arbor_ai-0.2.dist-info/licenses/LICENSE,sha256=5vFGrbOFeXXM83JV9o16w7ohH4WLeu3-57GocJSz8ow,1067
|
38
|
-
arbor_ai-0.2.dist-info/METADATA,sha256=
|
39
|
-
arbor_ai-0.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
40
|
-
arbor_ai-0.2.dist-info/entry_points.txt,sha256=PGBX-MfNwfIl8UPFgsX3gjtXLqSogRhOktKMpZUysD0,40
|
41
|
-
arbor_ai-0.2.dist-info/top_level.txt,sha256=jzWdp3BRYqvZDMFsPajrcftvvlluzVDErkD8IMRfhYs,6
|
42
|
-
arbor_ai-0.2.dist-info/RECORD,,
|
37
|
+
arbor_ai-0.2.1.dist-info/licenses/LICENSE,sha256=5vFGrbOFeXXM83JV9o16w7ohH4WLeu3-57GocJSz8ow,1067
|
38
|
+
arbor_ai-0.2.1.dist-info/METADATA,sha256=34XAZBm8OLlsSBicLmRn_hhbltn0pDNlAj5WOjn9LtE,2791
|
39
|
+
arbor_ai-0.2.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
40
|
+
arbor_ai-0.2.1.dist-info/entry_points.txt,sha256=PGBX-MfNwfIl8UPFgsX3gjtXLqSogRhOktKMpZUysD0,40
|
41
|
+
arbor_ai-0.2.1.dist-info/top_level.txt,sha256=jzWdp3BRYqvZDMFsPajrcftvvlluzVDErkD8IMRfhYs,6
|
42
|
+
arbor_ai-0.2.1.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|