rapidfireai 0.9.10__py3-none-any.whl → 0.9.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of rapidfireai might be problematic. Click here for more details.
- rapidfireai/cli.py +23 -3
- rapidfireai/experiment.py +5 -1
- rapidfireai/start.sh +152 -139
- rapidfireai/utils/constants.py +1 -2
- rapidfireai/utils/ping.py +29 -0
- rapidfireai/utils/shm_manager.py +15 -11
- rapidfireai/version.py +2 -2
- {rapidfireai-0.9.10.dist-info → rapidfireai-0.9.11.dist-info}/METADATA +31 -7
- {rapidfireai-0.9.10.dist-info → rapidfireai-0.9.11.dist-info}/RECORD +19 -12
- tutorial_notebooks/rf-tutorial-dpo-alignment-lite.ipynb +412 -0
- tutorial_notebooks/rf-tutorial-dpo-alignment.ipynb +427 -0
- tutorial_notebooks/rf-tutorial-grpo-mathreasoning-lite.ipynb +358 -0
- tutorial_notebooks/rf-tutorial-grpo-mathreasoning.ipynb +371 -0
- tutorial_notebooks/rf-tutorial-sft-chatqa-lite.ipynb +329 -0
- tutorial_notebooks/rf-tutorial-sft-chatqa.ipynb +331 -0
- {rapidfireai-0.9.10.dist-info → rapidfireai-0.9.11.dist-info}/WHEEL +0 -0
- {rapidfireai-0.9.10.dist-info → rapidfireai-0.9.11.dist-info}/entry_points.txt +0 -0
- {rapidfireai-0.9.10.dist-info → rapidfireai-0.9.11.dist-info}/licenses/LICENSE +0 -0
- {rapidfireai-0.9.10.dist-info → rapidfireai-0.9.11.dist-info}/top_level.txt +0 -0
rapidfireai/utils/constants.py
CHANGED
|
@@ -10,7 +10,6 @@ SHM_MIN_FREE_SPACE = 1.0
|
|
|
10
10
|
LOG_FILENAME = "rapidfire.log"
|
|
11
11
|
TRAINING_LOG_FILENAME = "training.log"
|
|
12
12
|
|
|
13
|
-
|
|
14
13
|
class LogType(Enum):
|
|
15
14
|
"""Enum class for log types"""
|
|
16
15
|
|
|
@@ -33,7 +32,7 @@ class DBConfig:
|
|
|
33
32
|
# Use user's home directory for database path
|
|
34
33
|
import os
|
|
35
34
|
|
|
36
|
-
DB_PATH: str = os.path.expanduser("
|
|
35
|
+
DB_PATH: str = os.path.join(os.getenv("RF_DB_PATH", os.path.expanduser(os.path.join("~","db"))), "rapidfire.db")
|
|
37
36
|
|
|
38
37
|
# Connection settings
|
|
39
38
|
CONNECTION_TIMEOUT: float = 30.0
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
#!/usr/bin/env python
|
|
2
|
+
import socket
|
|
3
|
+
import argparse
|
|
4
|
+
|
|
5
|
+
def ping_server(server: str, port: int, timeout=3):
|
|
6
|
+
"""ping server:port """
|
|
7
|
+
try:
|
|
8
|
+
socket.setdefaulttimeout(timeout)
|
|
9
|
+
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
|
|
10
|
+
s.connect((server, port))
|
|
11
|
+
except OSError as _:
|
|
12
|
+
return False
|
|
13
|
+
else:
|
|
14
|
+
s.close()
|
|
15
|
+
return True
|
|
16
|
+
|
|
17
|
+
if __name__ == "__main__":
|
|
18
|
+
parser = argparse.ArgumentParser(description="Ping a server port")
|
|
19
|
+
parser.add_argument("server", type=str, help="Server to ping")
|
|
20
|
+
parser.add_argument("port", type=int, help="Port to ping")
|
|
21
|
+
parser.add_argument("--timeout", "-t", type=int, help="Timeout in seconds", default=3)
|
|
22
|
+
parser.add_argument("--verbose", "-v", action="store_true", help="Verbose output")
|
|
23
|
+
args = parser.parse_args()
|
|
24
|
+
ping_result = ping_server(args.server, args.port, args.timeout)
|
|
25
|
+
if args.verbose:
|
|
26
|
+
print(ping_result)
|
|
27
|
+
if ping_result:
|
|
28
|
+
exit(0)
|
|
29
|
+
exit(1)
|
rapidfireai/utils/shm_manager.py
CHANGED
|
@@ -286,7 +286,7 @@ class SharedMemoryManager:
|
|
|
286
286
|
# create model entry in registry
|
|
287
287
|
if model_id not in self._registry:
|
|
288
288
|
self._registry[model_id] = {SHMObjectType.CHECKPOINTS: {}}
|
|
289
|
-
|
|
289
|
+
|
|
290
290
|
model_entry = self._registry[model_id]
|
|
291
291
|
if SHMObjectType.CHECKPOINTS not in model_entry:
|
|
292
292
|
model_entry[SHMObjectType.CHECKPOINTS] = {}
|
|
@@ -445,18 +445,22 @@ class SharedMemoryManager:
|
|
|
445
445
|
SHMObjectType.CHECKPOINTS: {},
|
|
446
446
|
}
|
|
447
447
|
|
|
448
|
+
# copy full_model, ref_state_dict, and checkpoints from warm_started_from to model_id
|
|
448
449
|
model_entry = dict(self._registry[model_id])
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
|
|
450
|
+
if SHMObjectType.FULL_MODEL in self._registry[warm_started_from]:
|
|
451
|
+
model_entry[SHMObjectType.FULL_MODEL] = copy.deepcopy(
|
|
452
|
+
dict(self._registry[warm_started_from])[SHMObjectType.FULL_MODEL]
|
|
453
|
+
)
|
|
454
|
+
if SHMObjectType.REF_STATE_DICT in self._registry[warm_started_from]:
|
|
455
|
+
model_entry[SHMObjectType.REF_STATE_DICT] = copy.deepcopy(
|
|
456
|
+
dict(self._registry[warm_started_from])[SHMObjectType.REF_STATE_DICT]
|
|
457
|
+
)
|
|
458
|
+
if SHMObjectType.CHECKPOINTS in self._registry[warm_started_from]:
|
|
459
|
+
model_entry[SHMObjectType.CHECKPOINTS] = copy.deepcopy(
|
|
460
|
+
dict(self._registry[warm_started_from])[SHMObjectType.CHECKPOINTS]
|
|
461
|
+
)
|
|
458
462
|
self._registry[model_id] = model_entry
|
|
459
|
-
self.logger.debug(f"Copied warm start checkpoint from {warm_started_from} to {model_id}")
|
|
463
|
+
self.logger.debug(f"Copied warm start checkpoint from run {warm_started_from} to run {model_id}")
|
|
460
464
|
|
|
461
465
|
def list_models(self):
|
|
462
466
|
"""Get list of all model IDs currently in shared memory."""
|
rapidfireai/version.py
CHANGED
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: rapidfireai
|
|
3
|
-
Version: 0.9.
|
|
3
|
+
Version: 0.9.11
|
|
4
4
|
Summary: RapidFire AI: Rapid Experimentation Engine for Customizing LLMs
|
|
5
5
|
Author-email: "RapidFire AI Inc." <support@rapidfire.ai>
|
|
6
6
|
License: Apache-2.0
|
|
7
7
|
Project-URL: Homepage, https://rapidfire.ai
|
|
8
|
-
Keywords: ai,rapidfire,rapidfireai,deep-learning,artificial-intelligence,machine-learning,mlflow,experiment-tracking
|
|
8
|
+
Keywords: ai,rapidfire,rapidfireai,llm,deep-learning,artificial-intelligence,machine-learning,mlflow,experiment-tracking
|
|
9
9
|
Classifier: Development Status :: 4 - Beta
|
|
10
10
|
Classifier: Intended Audience :: Developers
|
|
11
11
|
Classifier: Intended Audience :: Science/Research
|
|
@@ -67,13 +67,16 @@ RapidFire AI is a new experiment execution framework that transforms your LLM cu
|
|
|
67
67
|
|
|
68
68
|
|
|
69
69
|
## Getting Started
|
|
70
|
+
|
|
70
71
|
### Prerequisites
|
|
72
|
+
|
|
71
73
|
- [NVIDIA GPU with Compute Capability 7.x or 8.x](https://developer.nvidia.com/cuda-gpus)
|
|
72
74
|
- [NVIDIA CUDA Toolkit 11.8+](https://developer.nvidia.com/cuda-toolkit-archive)
|
|
73
75
|
- [Python 3.12.x](https://www.python.org/downloads/)
|
|
74
76
|
- [PyTorch 2.7.1+](https://pytorch.org/get-started/previous-versions/) with corresponding forward compatible prebuilt CUDA binaries
|
|
75
77
|
|
|
76
78
|
### Installation/Starting
|
|
79
|
+
|
|
77
80
|
```bash
|
|
78
81
|
virtualenv -p python3 oss_venv
|
|
79
82
|
source oss_venv/bin/activate
|
|
@@ -82,6 +85,8 @@ source oss_venv/bin/activate
|
|
|
82
85
|
pip install rapidfireai
|
|
83
86
|
|
|
84
87
|
# install specific dependencies and initialize rapidfire
|
|
88
|
+
# Optionally set RF_TUTORIAL_PATH environment variable to specify
|
|
89
|
+
# alternate location for copying tutorial notebooks to
|
|
85
90
|
rapidfireai init
|
|
86
91
|
|
|
87
92
|
# start the rapidfire server
|
|
@@ -90,7 +95,18 @@ rapidfireai start
|
|
|
90
95
|
# open up example notebook and start experiment
|
|
91
96
|
```
|
|
92
97
|
|
|
98
|
+
### Running tutorial notebooks
|
|
99
|
+
|
|
100
|
+
```bash
|
|
101
|
+
source oss_venv/bin/activate
|
|
102
|
+
|
|
103
|
+
# replace <your_token> with your Hugging Face token (see the link below)
|
|
104
|
+
# https://huggingface.co/docs/hub/en/security-tokens
|
|
105
|
+
pip install "huggingface-hub[cli]"
|
|
106
|
+
hf auth login --token <your_token>
|
|
93
107
|
|
|
108
|
+
# open up example notebook from ./tutorial_notebooks and start experiment
|
|
109
|
+
```
|
|
94
110
|
|
|
95
111
|
### Troubleshooting
|
|
96
112
|
|
|
@@ -110,13 +126,14 @@ lsof -t -i:3000 | xargs kill -9 # frontend server
|
|
|
110
126
|
|
|
111
127
|
Browse or reference the full documentation, example use case tutorials, all API details, dashboard details, and more [here](https://rapidfire-ai-oss-docs.readthedocs-hosted.com/).
|
|
112
128
|
|
|
113
|
-
|
|
114
129
|
## Key Features
|
|
115
130
|
|
|
116
131
|
### MLflow Integration
|
|
132
|
+
|
|
117
133
|
Full MLflow support for experiment tracking and metrics visualization. A named RapidFire AI experiment corresponds to an MLflow experiment for comprehensive governance.
|
|
118
134
|
|
|
119
135
|
### Interactive Control Operations (IC Ops)
|
|
136
|
+
|
|
120
137
|
First-of-its-kind dynamic real-time control over runs in flight. Can be invoked through the dashboard:
|
|
121
138
|
- Stop active runs; puts them in a dormant state
|
|
122
139
|
- Resume stopped runs; makes them active again
|
|
@@ -124,11 +141,12 @@ First-of-its-kind dynamic real-time control over runs in flight. Can be invoked
|
|
|
124
141
|
- Delete unwanted or failed runs
|
|
125
142
|
|
|
126
143
|
### Multi-GPU Support
|
|
144
|
+
|
|
127
145
|
The Scheduler automatically handles multiple GPUs on the machine and divides resources across all running configs for optimal resource utilization.
|
|
128
146
|
|
|
129
147
|
### Search and AutoML Support
|
|
130
|
-
Built-in procedures for searching over configuration knob combinations, including Grid Search and Random Search. Easy to integrate with AutoML procedures. Native support for some popular AutoML procedures and customized automation of IC Ops coming soon.
|
|
131
148
|
|
|
149
|
+
Built-in procedures for searching over configuration knob combinations, including Grid Search and Random Search. Easy to integrate with AutoML procedures. Native support for some popular AutoML procedures and customized automation of IC Ops coming soon.
|
|
132
150
|
|
|
133
151
|
## Directory Structure
|
|
134
152
|
|
|
@@ -147,6 +165,7 @@ rapidfireai/
|
|
|
147
165
|
## Architecture
|
|
148
166
|
|
|
149
167
|
RapidFire AI adopts a microservices-inspired loosely coupled distributed architecture with:
|
|
168
|
+
|
|
150
169
|
- **Dispatcher**: Web API layer for UI communication
|
|
151
170
|
- **Database**: SQLite for state persistence
|
|
152
171
|
- **Controller**: Central orchestrator running in user process
|
|
@@ -155,30 +174,36 @@ RapidFire AI adopts a microservices-inspired loosely coupled distributed archite
|
|
|
155
174
|
|
|
156
175
|
This design enables efficient resource utilization while providing a seamless user experience for AI experimentation.
|
|
157
176
|
|
|
158
|
-
|
|
159
177
|
## Components
|
|
160
178
|
|
|
161
179
|
### Dispatcher
|
|
180
|
+
|
|
162
181
|
The dispatcher provides a REST API interface for the web UI. It can be run via Flask as a single app or via Gunicorn to have it load balanced. Handles interactive control features and displays the current state of the runs in the experiment.
|
|
163
182
|
|
|
164
183
|
### Database
|
|
184
|
+
|
|
165
185
|
Uses SQLite for persistent storage of metadata of experiments, runs, and artifacts. The Controller also uses it to talk with Workers on scheduling state. A clean asynchronous interface for all DB operations, including experiment lifecycle management and run tracking.
|
|
166
186
|
|
|
167
187
|
### Controller
|
|
188
|
+
|
|
168
189
|
Runs as part of the user’s console or Notebook process. Orchestrates the entire training lifecycle including model creation, worker management, and scheduling. The `run_fit` logic handles sample preprocessing, model creation for given knob configurations, worker initialization, and continuous monitoring of training progress across distributed workers.
|
|
169
190
|
|
|
170
191
|
### Worker
|
|
192
|
+
|
|
171
193
|
Handles the actual model training and inference on the GPUs. Workers poll the Database for tasks, load dataset chunks, and execute training runs with checkpointing and progress reporting. Currently expects any given model for given batch size to fit on a single GPU.
|
|
172
194
|
|
|
173
195
|
### Experiment
|
|
196
|
+
|
|
174
197
|
Manages the complete experiment lifecycle, including creation, naming conventions, and cleanup. Experiments are automatically named with unique suffixes if conflicts exist, and all experiment metadata is tracked in the Database. An experiment's running tasks are automatically cancelled when the process ends abruptly.
|
|
175
198
|
|
|
176
199
|
### Dashboard
|
|
177
|
-
A fork of MLflow that enables full tracking and visualization of all experiments and runs. It features a new panel for Interactive Control Ops that can be performed on any active runs.
|
|
178
200
|
|
|
201
|
+
A fork of MLflow that enables full tracking and visualization of all experiments and runs. It features a new panel for Interactive Control Ops that can be performed on any active runs.
|
|
179
202
|
|
|
180
203
|
## Developing with RapidFire AI
|
|
204
|
+
|
|
181
205
|
### Prerequisites
|
|
206
|
+
|
|
182
207
|
- Python 3.x
|
|
183
208
|
- Git
|
|
184
209
|
- Ubuntu/Debian system (for apt package manager)
|
|
@@ -244,4 +269,3 @@ lsof -t -i:8080 | xargs kill -9 # dispatcher
|
|
|
244
269
|
lsof -t -i:5002 | xargs kill -9 # mlflow
|
|
245
270
|
lsof -t -i:3000 | xargs kill -9 # frontend
|
|
246
271
|
```
|
|
247
|
-
|
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
rapidfireai/__init__.py,sha256=mSV8CiaJ9LwjCpMdHSBd9bM-JBijDx-lc8hGny1KEsQ,368
|
|
2
|
-
rapidfireai/cli.py,sha256=
|
|
3
|
-
rapidfireai/experiment.py,sha256=
|
|
4
|
-
rapidfireai/start.sh,sha256=
|
|
5
|
-
rapidfireai/version.py,sha256=
|
|
2
|
+
rapidfireai/cli.py,sha256=AFaTAhLDbN5jf3o8EWtAmtChQCec4svAihKy-RdKZfk,15226
|
|
3
|
+
rapidfireai/experiment.py,sha256=YvUAzwM3o-gEYECDOmDhbB9szKthfMAw6aiAvJ91gYA,6962
|
|
4
|
+
rapidfireai/start.sh,sha256=o0Mp2EMrZ3lHkTdyCfOqg4JvatN220Kq2VuZNfhFFGg,21268
|
|
5
|
+
rapidfireai/version.py,sha256=nbW2oD0HrwXmHwyIIoFZX7CSAxAW5RLgcRirqDTBqvI,99
|
|
6
6
|
rapidfireai/automl/__init__.py,sha256=QnzWa33i9aMp1NatoQYJFPrGZchtTUAPkgSOyyDXbSU,501
|
|
7
7
|
rapidfireai/automl/base.py,sha256=pF6NQMr8DeEFm4PBbmbUbNAtP0S-yDfeUnKMqz2D9Zk,1947
|
|
8
8
|
rapidfireai/automl/datatypes.py,sha256=rbocXidGekpeukKQuMSZLFK6h6h4PIo1Fvre2FWmhqU,1470
|
|
@@ -300,19 +300,26 @@ rapidfireai/ml/checkpoint_utils.py,sha256=L6xMkaFD4onWVP_TJhymYgPI0LrC_TuLgFjoCk
|
|
|
300
300
|
rapidfireai/ml/trainer.py,sha256=5AMHgS7ZrC0x_K49TedQxQEzRBoGUk81DkJj0Csh4CI,12799
|
|
301
301
|
rapidfireai/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
302
302
|
rapidfireai/utils/automl_utils.py,sha256=4IeGZyYRxSdoKk1dBcTI5_JRms70TyiWL9F6Gta31BI,2004
|
|
303
|
-
rapidfireai/utils/constants.py,sha256=
|
|
303
|
+
rapidfireai/utils/constants.py,sha256=H2LpiQuJqC58I0P7_J53FmxKEkWWRByLq_-hoQNi30E,3305
|
|
304
304
|
rapidfireai/utils/datapaths.py,sha256=PKgZu_qWx2z6QBIfmzmjY0lWG79GaU6W3577_34yX10,2554
|
|
305
305
|
rapidfireai/utils/exceptions.py,sha256=RA6kMSV3nCz3oE-yhuNLDEneDqTUrZC6N0AkSRBdAlg,2002
|
|
306
306
|
rapidfireai/utils/experiment_utils.py,sha256=7ow1RGk4dnXOKVnkjcHNSYGjLLlVgPlrvnjt_hq_0Ik,14688
|
|
307
307
|
rapidfireai/utils/logging.py,sha256=X6hLKk4alVUhPqs4CdBmPj4ppSOkQ0WoyczNzCWs02E,3050
|
|
308
308
|
rapidfireai/utils/mlflow_manager.py,sha256=iGuA5ubmhTjhxtZrLCsStpCHBAidnnvONb5LVWZv-RE,5046
|
|
309
|
+
rapidfireai/utils/ping.py,sha256=d8d5Ykx-Tn0HRFeo3xzxwc__KMn2t9FvEd7ur9YLts8,976
|
|
309
310
|
rapidfireai/utils/serialize.py,sha256=_A9egs2uhlYNGT3Ntv2fzH7rwp6I-GGVoS4ViY3sufU,401
|
|
310
|
-
rapidfireai/utils/shm_manager.py,sha256=
|
|
311
|
+
rapidfireai/utils/shm_manager.py,sha256=OU-EEKMylW-q-oldh5KDmW770gz7yjYvhCw-_IRwquQ,21848
|
|
311
312
|
rapidfireai/utils/trainer_config.py,sha256=91X4-Z8aZl7W-W6Yf-wQINeFPFIf0gvzKT6Z3mfgYXA,587
|
|
312
313
|
rapidfireai/utils/worker_manager.py,sha256=LsXnXC2yDwnIp7tm1shpI6DMpif6XGtZ-4kDoo302tk,7971
|
|
313
|
-
rapidfireai-0.9.
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
314
|
+
rapidfireai-0.9.11.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
|
315
|
+
tutorial_notebooks/rf-tutorial-dpo-alignment-lite.ipynb,sha256=d3cQ8o0myJC7gyAZDZti9FmCEBpQ49BPbFpGMq-U7lY,13241
|
|
316
|
+
tutorial_notebooks/rf-tutorial-dpo-alignment.ipynb,sha256=3pf7tjYHrmdL1O06nsjI7V_T7LP_AH_Qcgvj5ykv6yE,13854
|
|
317
|
+
tutorial_notebooks/rf-tutorial-grpo-mathreasoning-lite.ipynb,sha256=cl1oxroOLIiVv8yFWGrYqmhKgE7RIBUg7EZCgiv9XG8,11576
|
|
318
|
+
tutorial_notebooks/rf-tutorial-grpo-mathreasoning.ipynb,sha256=Kdeoadw5lrTMQF9Zn42kYhldvQdnD1VLXWgI94Rq8So,12455
|
|
319
|
+
tutorial_notebooks/rf-tutorial-sft-chatqa-lite.ipynb,sha256=v7ITbSqYJgDKFzXJ5Mz4PdQFNCayDFvW6y0CFgao10Y,10468
|
|
320
|
+
tutorial_notebooks/rf-tutorial-sft-chatqa.ipynb,sha256=edaOoWJtQf19zZKL0DEw9QynFvgvP0842Lwsw5cDQ9E,10343
|
|
321
|
+
rapidfireai-0.9.11.dist-info/METADATA,sha256=VKKhbyOtnIEbNMXT7Zz-CNKJOUj6msEXh7Za_I1EuVs,10557
|
|
322
|
+
rapidfireai-0.9.11.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
323
|
+
rapidfireai-0.9.11.dist-info/entry_points.txt,sha256=tuZF1oC4KyQ9H767o83S8Y-ZiGvw_PVADPL1vRykY3g,53
|
|
324
|
+
rapidfireai-0.9.11.dist-info/top_level.txt,sha256=A28FddyVhe1LHCbvbigLRtmEWKHGVgOVKH1_FfbUQ2U,12
|
|
325
|
+
rapidfireai-0.9.11.dist-info/RECORD,,
|