rapidfireai 0.9.9__py3-none-any.whl → 0.9.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of rapidfireai might be problematic. Click here for more details.
- rapidfireai/cli.py +25 -5
- rapidfireai/experiment.py +5 -1
- rapidfireai/start.sh +154 -141
- rapidfireai/utils/constants.py +1 -2
- rapidfireai/utils/ping.py +29 -0
- rapidfireai/utils/shm_manager.py +15 -11
- rapidfireai/version.py +2 -2
- {rapidfireai-0.9.9.dist-info → rapidfireai-0.9.11.dist-info}/METADATA +40 -11
- {rapidfireai-0.9.9.dist-info → rapidfireai-0.9.11.dist-info}/RECORD +19 -12
- rapidfireai-0.9.11.dist-info/entry_points.txt +2 -0
- tutorial_notebooks/rf-tutorial-dpo-alignment-lite.ipynb +412 -0
- tutorial_notebooks/rf-tutorial-dpo-alignment.ipynb +427 -0
- tutorial_notebooks/rf-tutorial-grpo-mathreasoning-lite.ipynb +358 -0
- tutorial_notebooks/rf-tutorial-grpo-mathreasoning.ipynb +371 -0
- tutorial_notebooks/rf-tutorial-sft-chatqa-lite.ipynb +329 -0
- tutorial_notebooks/rf-tutorial-sft-chatqa.ipynb +331 -0
- rapidfireai-0.9.9.dist-info/entry_points.txt +0 -2
- {rapidfireai-0.9.9.dist-info → rapidfireai-0.9.11.dist-info}/WHEEL +0 -0
- {rapidfireai-0.9.9.dist-info → rapidfireai-0.9.11.dist-info}/licenses/LICENSE +0 -0
- {rapidfireai-0.9.9.dist-info → rapidfireai-0.9.11.dist-info}/top_level.txt +0 -0
rapidfireai/utils/constants.py
CHANGED
|
@@ -10,7 +10,6 @@ SHM_MIN_FREE_SPACE = 1.0
|
|
|
10
10
|
LOG_FILENAME = "rapidfire.log"
|
|
11
11
|
TRAINING_LOG_FILENAME = "training.log"
|
|
12
12
|
|
|
13
|
-
|
|
14
13
|
class LogType(Enum):
|
|
15
14
|
"""Enum class for log types"""
|
|
16
15
|
|
|
@@ -33,7 +32,7 @@ class DBConfig:
|
|
|
33
32
|
# Use user's home directory for database path
|
|
34
33
|
import os
|
|
35
34
|
|
|
36
|
-
DB_PATH: str = os.path.expanduser("
|
|
35
|
+
DB_PATH: str = os.path.join(os.getenv("RF_DB_PATH", os.path.expanduser(os.path.join("~","db"))), "rapidfire.db")
|
|
37
36
|
|
|
38
37
|
# Connection settings
|
|
39
38
|
CONNECTION_TIMEOUT: float = 30.0
|
|
rapidfireai/utils/ping.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
#!/usr/bin/env python
|
|
2
|
+
import socket
|
|
3
|
+
import argparse
|
|
4
|
+
|
|
5
|
+
def ping_server(server: str, port: int, timeout=3):
|
|
6
|
+
"""ping server:port """
|
|
7
|
+
try:
|
|
8
|
+
socket.setdefaulttimeout(timeout)
|
|
9
|
+
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
|
|
10
|
+
s.connect((server, port))
|
|
11
|
+
except OSError as _:
|
|
12
|
+
return False
|
|
13
|
+
else:
|
|
14
|
+
s.close()
|
|
15
|
+
return True
|
|
16
|
+
|
|
17
|
+
if __name__ == "__main__":
|
|
18
|
+
parser = argparse.ArgumentParser(description="Ping a server port")
|
|
19
|
+
parser.add_argument("server", type=str, help="Server to ping")
|
|
20
|
+
parser.add_argument("port", type=int, help="Port to ping")
|
|
21
|
+
parser.add_argument("--timeout", "-t", type=int, help="Timeout in seconds", default=3)
|
|
22
|
+
parser.add_argument("--verbose", "-v", action="store_true", help="Verbose output")
|
|
23
|
+
args = parser.parse_args()
|
|
24
|
+
ping_result = ping_server(args.server, args.port, args.timeout)
|
|
25
|
+
if args.verbose:
|
|
26
|
+
print(ping_result)
|
|
27
|
+
if ping_result:
|
|
28
|
+
exit(0)
|
|
29
|
+
exit(1)
|
rapidfireai/utils/shm_manager.py
CHANGED
|
@@ -286,7 +286,7 @@ class SharedMemoryManager:
|
|
|
286
286
|
# create model entry in registry
|
|
287
287
|
if model_id not in self._registry:
|
|
288
288
|
self._registry[model_id] = {SHMObjectType.CHECKPOINTS: {}}
|
|
289
|
-
|
|
289
|
+
|
|
290
290
|
model_entry = self._registry[model_id]
|
|
291
291
|
if SHMObjectType.CHECKPOINTS not in model_entry:
|
|
292
292
|
model_entry[SHMObjectType.CHECKPOINTS] = {}
|
|
@@ -445,18 +445,22 @@ class SharedMemoryManager:
|
|
|
445
445
|
SHMObjectType.CHECKPOINTS: {},
|
|
446
446
|
}
|
|
447
447
|
|
|
448
|
+
# copy full_model, ref_state_dict, and checkpoints from warm_started_from to model_id
|
|
448
449
|
model_entry = dict(self._registry[model_id])
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
|
|
450
|
+
if SHMObjectType.FULL_MODEL in self._registry[warm_started_from]:
|
|
451
|
+
model_entry[SHMObjectType.FULL_MODEL] = copy.deepcopy(
|
|
452
|
+
dict(self._registry[warm_started_from])[SHMObjectType.FULL_MODEL]
|
|
453
|
+
)
|
|
454
|
+
if SHMObjectType.REF_STATE_DICT in self._registry[warm_started_from]:
|
|
455
|
+
model_entry[SHMObjectType.REF_STATE_DICT] = copy.deepcopy(
|
|
456
|
+
dict(self._registry[warm_started_from])[SHMObjectType.REF_STATE_DICT]
|
|
457
|
+
)
|
|
458
|
+
if SHMObjectType.CHECKPOINTS in self._registry[warm_started_from]:
|
|
459
|
+
model_entry[SHMObjectType.CHECKPOINTS] = copy.deepcopy(
|
|
460
|
+
dict(self._registry[warm_started_from])[SHMObjectType.CHECKPOINTS]
|
|
461
|
+
)
|
|
458
462
|
self._registry[model_id] = model_entry
|
|
459
|
-
self.logger.debug(f"Copied warm start checkpoint from {warm_started_from} to {model_id}")
|
|
463
|
+
self.logger.debug(f"Copied warm start checkpoint from run {warm_started_from} to run {model_id}")
|
|
460
464
|
|
|
461
465
|
def list_models(self):
|
|
462
466
|
"""Get list of all model IDs currently in shared memory."""
|
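rapidfireai/version.py
CHANGED
[diff for this file is not shown in this listing]
{rapidfireai-0.9.9.dist-info → rapidfireai-0.9.11.dist-info}/METADATA
CHANGED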
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: rapidfireai
|
|
3
|
-
Version: 0.9.9
|
|
4
|
-
Summary: RapidFire AI
|
|
3
|
+
Version: 0.9.11
|
|
4
|
+
Summary: RapidFire AI: Rapid Experimentation Engine for Customizing LLMs
|
|
5
5
|
Author-email: "RapidFire AI Inc." <support@rapidfire.ai>
|
|
6
6
|
License: Apache-2.0
|
|
7
7
|
Project-URL: Homepage, https://rapidfire.ai
|
|
8
|
-
Keywords: ai,rapidfire,deep-learning,artificial-intelligence,machine-learning,mlflow,experiment-tracking
|
|
8
|
+
Keywords: ai,rapidfire,rapidfireai,llm,deep-learning,artificial-intelligence,machine-learning,mlflow,experiment-tracking
|
|
9
9
|
Classifier: Development Status :: 4 - Beta
|
|
10
10
|
Classifier: Intended Audience :: Developers
|
|
11
11
|
Classifier: Intended Audience :: Science/Research
|
|
@@ -63,17 +63,20 @@ Rapid experimentation for easier, faster, and more impactful fine-tuning and pos
|
|
|
63
63
|
|
|
64
64
|
RapidFire AI is a new experiment execution framework that transforms your LLM customization experimentation from slow, sequential processes into rapid, intelligent workflows with hyperparallelized training, dynamic real-time experiment control, and automatic multi-GPU system orchestration.
|
|
65
65
|
|
|
66
|
-

|
|
67
67
|
|
|
68
68
|
|
|
69
69
|
## Getting Started
|
|
70
|
+
|
|
70
71
|
### Prerequisites
|
|
72
|
+
|
|
71
73
|
- [NVIDIA GPU using the 7.x or 8.x Compute Capability](https://developer.nvidia.com/cuda-gpus)
|
|
72
74
|
- [NVIDIA CUDA Toolkit 11.8+](https://developer.nvidia.com/cuda-toolkit-archive)
|
|
73
75
|
- [Python 3.12.x](https://www.python.org/downloads/)
|
|
74
76
|
- [PyTorch 2.7.1+](https://pytorch.org/get-started/previous-versions/) with corresponding forward compatible prebuilt CUDA binaries
|
|
75
77
|
|
|
76
78
|
### Installation/Starting
|
|
79
|
+
|
|
77
80
|
```bash
|
|
78
81
|
virtualenv -p python3 oss_venv
|
|
79
82
|
source oss_venv/bin/activate
|
|
@@ -82,18 +85,36 @@ source oss_venv/bin/activate
|
|
|
82
85
|
pip install rapidfireai
|
|
83
86
|
|
|
84
87
|
# install specific dependencies and initialize rapidfire
|
|
85
|
-
|
|
88
|
+
# Optionally set the RF_TUTORIAL_PATH environment variable to specify an
|
|
89
|
+
# alternate location for copying tutorial notebooks to
|
|
90
|
+
rapidfireai init
|
|
86
91
|
|
|
87
92
|
# start the rapidfire server
|
|
88
|
-
|
|
93
|
+
rapidfireai start
|
|
89
94
|
|
|
90
95
|
# open up example notebook and start experiment
|
|
91
96
|
```
|
|
92
97
|
|
|
98
|
+
### Running tutorial notebooks
|
|
99
|
+
|
|
100
|
+
```bash
|
|
101
|
+
source oss_venv/bin/activate
|
|
93
102
|
|
|
103
|
+
# replace <your_token> with your Hugging Face token, obtained as described at:
|
|
104
|
+
# https://huggingface.co/docs/hub/en/security-tokens
|
|
105
|
+
pip install "huggingface-hub[cli]"
|
|
106
|
+
hf auth login --token <your_token>
|
|
107
|
+
|
|
108
|
+
# open up example notebook from ./tutorial_notebooks and start experiment
|
|
109
|
+
```
|
|
94
110
|
|
|
95
111
|
### Troubleshooting
|
|
96
112
|
|
|
113
|
+
For a quick system diagnostics report (Python env, relevant packages, GPU/CUDA, and key environment variables), run:
|
|
114
|
+
```bash
|
|
115
|
+
rapidfireai doctor
|
|
116
|
+
```
|
|
117
|
+
|
|
97
118
|
If you encounter port conflicts, you can kill existing processes:
|
|
98
119
|
```bash
|
|
99
120
|
lsof -t -i:5002 | xargs kill -9 # mlflow
|
|
@@ -105,13 +126,14 @@ lsof -t -i:3000 | xargs kill -9 # frontend server
|
|
|
105
126
|
|
|
106
127
|
Browse or reference the full documentation, example use case tutorials, all API details, dashboard details, and more [here](https://rapidfire-ai-oss-docs.readthedocs-hosted.com/).
|
|
107
128
|
|
|
108
|
-
|
|
109
129
|
## Key Features
|
|
110
130
|
|
|
111
131
|
### MLflow Integration
|
|
132
|
+
|
|
112
133
|
Full MLflow support for experiment tracking and metrics visualization. A named RapidFire AI experiment corresponds to an MLflow experiment for comprehensive governance.
|
|
113
134
|
|
|
114
135
|
### Interactive Control Operations (IC Ops)
|
|
136
|
+
|
|
115
137
|
First-of-its-kind dynamic real-time control over runs in flight. Can be invoked through the dashboard:
|
|
116
138
|
- Stop active runs; puts them in a dormant state
|
|
117
139
|
- Resume stopped runs; makes them active again
|
|
@@ -119,11 +141,12 @@ First-of-its-kind dynamic real-time control over runs in flight. Can be invoked
|
|
|
119
141
|
- Delete unwanted or failed runs
|
|
120
142
|
|
|
121
143
|
### Multi-GPU Support
|
|
144
|
+
|
|
122
145
|
The Scheduler automatically handles multiple GPUs on the machine and divides resources across all running configs for optimal resource utilization.
|
|
123
146
|
|
|
124
147
|
### Search and AutoML Support
|
|
125
|
-
Built-in procedures for searching over configuration knob combinations, including Grid Search and Random Search. Easy to integrate with AutoML procedures. Native support for some popular AutoML procedures and customized automation of IC Ops coming soon.
|
|
126
148
|
|
|
149
|
+
Built-in procedures for searching over configuration knob combinations, including Grid Search and Random Search. Easy to integrate with AutoML procedures. Native support for some popular AutoML procedures and customized automation of IC Ops coming soon.
|
|
127
150
|
|
|
128
151
|
## Directory Structure
|
|
129
152
|
|
|
@@ -142,6 +165,7 @@ rapidfireai/
|
|
|
142
165
|
## Architecture
|
|
143
166
|
|
|
144
167
|
RapidFire AI adopts a microservices-inspired loosely coupled distributed architecture with:
|
|
168
|
+
|
|
145
169
|
- **Dispatcher**: Web API layer for UI communication
|
|
146
170
|
- **Database**: SQLite for state persistence
|
|
147
171
|
- **Controller**: Central orchestrator running in user process
|
|
@@ -150,30 +174,36 @@ RapidFire AI adopts a microservices-inspired loosely coupled distributed archite
|
|
|
150
174
|
|
|
151
175
|
This design enables efficient resource utilization while providing a seamless user experience for AI experimentation.
|
|
152
176
|
|
|
153
|
-
|
|
154
177
|
## Components
|
|
155
178
|
|
|
156
179
|
### Dispatcher
|
|
180
|
+
|
|
157
181
|
The dispatcher provides a REST API interface for the web UI. It can be run via Flask as a single app or via Gunicorn to have it load balanced. Handles interactive control features and displays the current state of the runs in the experiment.
|
|
158
182
|
|
|
159
183
|
### Database
|
|
184
|
+
|
|
160
185
|
Uses SQLite for persistent storage of metadata of experiments, runs, and artifacts. The Controller also uses it to talk with Workers on scheduling state. A clean asynchronous interface for all DB operations, including experiment lifecycle management and run tracking.
|
|
161
186
|
|
|
162
187
|
### Controller
|
|
188
|
+
|
|
163
189
|
Runs as part of the user’s console or Notebook process. Orchestrates the entire training lifecycle including model creation, worker management, and scheduling. The `run_fit` logic handles sample preprocessing, model creation for given knob configurations, worker initialization, and continuous monitoring of training progress across distributed workers.
|
|
164
190
|
|
|
165
191
|
### Worker
|
|
192
|
+
|
|
166
193
|
Handles the actual model training and inference on the GPUs. Workers poll the Database for tasks, load dataset chunks, and execute training runs with checkpointing and progress reporting. Currently expects any given model for given batch size to fit on a single GPU.
|
|
167
194
|
|
|
168
195
|
### Experiment
|
|
196
|
+
|
|
169
197
|
Manages the complete experiment lifecycle, including creation, naming conventions, and cleanup. Experiments are automatically named with unique suffixes if conflicts exist, and all experiment metadata is tracked in the Database. An experiment's running tasks are automatically cancelled when the process ends abruptly.
|
|
170
198
|
|
|
171
199
|
### Dashboard
|
|
172
|
-
A fork of MLflow that enables full tracking and visualization of all experiments and runs. It features a new panel for Interactive Control Ops that can be performed on any active runs.
|
|
173
200
|
|
|
201
|
+
A fork of MLflow that enables full tracking and visualization of all experiments and runs. It features a new panel for Interactive Control Ops that can be performed on any active runs.
|
|
174
202
|
|
|
175
203
|
## Developing with RapidFire AI
|
|
204
|
+
|
|
176
205
|
### Prerequisites
|
|
206
|
+
|
|
177
207
|
- Python 3.x
|
|
178
208
|
- Git
|
|
179
209
|
- Ubuntu/Debian system (for apt package manager)
|
|
@@ -239,4 +269,3 @@ lsof -t -i:8080 | xargs kill -9 # dispatcher
|
|
|
239
269
|
lsof -t -i:5002 | xargs kill -9 # mlflow
|
|
240
270
|
lsof -t -i:3000 | xargs kill -9 # frontend
|
|
241
271
|
```
|
|
242
|
-
|
|
{rapidfireai-0.9.9.dist-info → rapidfireai-0.9.11.dist-info}/RECORD
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
rapidfireai/__init__.py,sha256=mSV8CiaJ9LwjCpMdHSBd9bM-JBijDx-lc8hGny1KEsQ,368
|
|
2
|
-
rapidfireai/cli.py,sha256=
|
|
3
|
-
rapidfireai/experiment.py,sha256=
|
|
4
|
-
rapidfireai/start.sh,sha256=
|
|
5
|
-
rapidfireai/version.py,sha256=
|
|
2
|
+
rapidfireai/cli.py,sha256=AFaTAhLDbN5jf3o8EWtAmtChQCec4svAihKy-RdKZfk,15226
|
|
3
|
+
rapidfireai/experiment.py,sha256=YvUAzwM3o-gEYECDOmDhbB9szKthfMAw6aiAvJ91gYA,6962
|
|
4
|
+
rapidfireai/start.sh,sha256=o0Mp2EMrZ3lHkTdyCfOqg4JvatN220Kq2VuZNfhFFGg,21268
|
|
5
|
+
rapidfireai/version.py,sha256=nbW2oD0HrwXmHwyIIoFZX7CSAxAW5RLgcRirqDTBqvI,99
|
|
6
6
|
rapidfireai/automl/__init__.py,sha256=QnzWa33i9aMp1NatoQYJFPrGZchtTUAPkgSOyyDXbSU,501
|
|
7
7
|
rapidfireai/automl/base.py,sha256=pF6NQMr8DeEFm4PBbmbUbNAtP0S-yDfeUnKMqz2D9Zk,1947
|
|
8
8
|
rapidfireai/automl/datatypes.py,sha256=rbocXidGekpeukKQuMSZLFK6h6h4PIo1Fvre2FWmhqU,1470
|
|
@@ -300,19 +300,26 @@ rapidfireai/ml/checkpoint_utils.py,sha256=L6xMkaFD4onWVP_TJhymYgPI0LrC_TuLgFjoCk
|
|
|
300
300
|
rapidfireai/ml/trainer.py,sha256=5AMHgS7ZrC0x_K49TedQxQEzRBoGUk81DkJj0Csh4CI,12799
|
|
301
301
|
rapidfireai/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
302
302
|
rapidfireai/utils/automl_utils.py,sha256=4IeGZyYRxSdoKk1dBcTI5_JRms70TyiWL9F6Gta31BI,2004
|
|
303
|
-
rapidfireai/utils/constants.py,sha256=
|
|
303
|
+
rapidfireai/utils/constants.py,sha256=H2LpiQuJqC58I0P7_J53FmxKEkWWRByLq_-hoQNi30E,3305
|
|
304
304
|
rapidfireai/utils/datapaths.py,sha256=PKgZu_qWx2z6QBIfmzmjY0lWG79GaU6W3577_34yX10,2554
|
|
305
305
|
rapidfireai/utils/exceptions.py,sha256=RA6kMSV3nCz3oE-yhuNLDEneDqTUrZC6N0AkSRBdAlg,2002
|
|
306
306
|
rapidfireai/utils/experiment_utils.py,sha256=7ow1RGk4dnXOKVnkjcHNSYGjLLlVgPlrvnjt_hq_0Ik,14688
|
|
307
307
|
rapidfireai/utils/logging.py,sha256=X6hLKk4alVUhPqs4CdBmPj4ppSOkQ0WoyczNzCWs02E,3050
|
|
308
308
|
rapidfireai/utils/mlflow_manager.py,sha256=iGuA5ubmhTjhxtZrLCsStpCHBAidnnvONb5LVWZv-RE,5046
|
|
309
|
+
rapidfireai/utils/ping.py,sha256=d8d5Ykx-Tn0HRFeo3xzxwc__KMn2t9FvEd7ur9YLts8,976
|
|
309
310
|
rapidfireai/utils/serialize.py,sha256=_A9egs2uhlYNGT3Ntv2fzH7rwp6I-GGVoS4ViY3sufU,401
|
|
310
|
-
rapidfireai/utils/shm_manager.py,sha256=
|
|
311
|
+
rapidfireai/utils/shm_manager.py,sha256=OU-EEKMylW-q-oldh5KDmW770gz7yjYvhCw-_IRwquQ,21848
|
|
311
312
|
rapidfireai/utils/trainer_config.py,sha256=91X4-Z8aZl7W-W6Yf-wQINeFPFIf0gvzKT6Z3mfgYXA,587
|
|
312
313
|
rapidfireai/utils/worker_manager.py,sha256=LsXnXC2yDwnIp7tm1shpI6DMpif6XGtZ-4kDoo302tk,7971
|
|
313
|
-
rapidfireai-0.9.
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
314
|
+
rapidfireai-0.9.11.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
|
315
|
+
tutorial_notebooks/rf-tutorial-dpo-alignment-lite.ipynb,sha256=d3cQ8o0myJC7gyAZDZti9FmCEBpQ49BPbFpGMq-U7lY,13241
|
|
316
|
+
tutorial_notebooks/rf-tutorial-dpo-alignment.ipynb,sha256=3pf7tjYHrmdL1O06nsjI7V_T7LP_AH_Qcgvj5ykv6yE,13854
|
|
317
|
+
tutorial_notebooks/rf-tutorial-grpo-mathreasoning-lite.ipynb,sha256=cl1oxroOLIiVv8yFWGrYqmhKgE7RIBUg7EZCgiv9XG8,11576
|
|
318
|
+
tutorial_notebooks/rf-tutorial-grpo-mathreasoning.ipynb,sha256=Kdeoadw5lrTMQF9Zn42kYhldvQdnD1VLXWgI94Rq8So,12455
|
|
319
|
+
tutorial_notebooks/rf-tutorial-sft-chatqa-lite.ipynb,sha256=v7ITbSqYJgDKFzXJ5Mz4PdQFNCayDFvW6y0CFgao10Y,10468
|
|
320
|
+
tutorial_notebooks/rf-tutorial-sft-chatqa.ipynb,sha256=edaOoWJtQf19zZKL0DEw9QynFvgvP0842Lwsw5cDQ9E,10343
|
|
321
|
+
rapidfireai-0.9.11.dist-info/METADATA,sha256=VKKhbyOtnIEbNMXT7Zz-CNKJOUj6msEXh7Za_I1EuVs,10557
|
|
322
|
+
rapidfireai-0.9.11.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
323
|
+
rapidfireai-0.9.11.dist-info/entry_points.txt,sha256=tuZF1oC4KyQ9H767o83S8Y-ZiGvw_PVADPL1vRykY3g,53
|
|
324
|
+
rapidfireai-0.9.11.dist-info/top_level.txt,sha256=A28FddyVhe1LHCbvbigLRtmEWKHGVgOVKH1_FfbUQ2U,12
|
|
325
|
+
rapidfireai-0.9.11.dist-info/RECORD,,
|