rapidfireai 0.9.9__py3-none-any.whl → 0.9.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


@@ -10,7 +10,6 @@ SHM_MIN_FREE_SPACE = 1.0
 LOG_FILENAME = "rapidfire.log"
 TRAINING_LOG_FILENAME = "training.log"
 
-
 class LogType(Enum):
     """Enum class for log types"""
 
@@ -33,7 +32,7 @@ class DBConfig:
     # Use user's home directory for database path
     import os
 
-    DB_PATH: str = os.path.expanduser("~/db/rapidfire.db")
+    DB_PATH: str = os.path.join(os.getenv("RF_DB_PATH", os.path.expanduser(os.path.join("~","db"))), "rapidfire.db")
 
     # Connection settings
     CONNECTION_TIMEOUT: float = 30.0
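
With this change the database directory can be overridden through the `RF_DB_PATH` environment variable, falling back to `~/db` when it is unset. A minimal standalone sketch of how the new default resolves (the `resolve_db_path` helper is illustrative, not part of the package):

```python
import os

def resolve_db_path() -> str:
    # Mirror of the new DB_PATH default: RF_DB_PATH overrides the directory,
    # otherwise fall back to ~/db; the filename stays rapidfire.db.
    base_dir = os.getenv("RF_DB_PATH", os.path.expanduser(os.path.join("~", "db")))
    return os.path.join(base_dir, "rapidfire.db")

# Example: RF_DB_PATH unset    -> /home/<user>/db/rapidfire.db
# Example: RF_DB_PATH=/data/rf -> /data/rf/rapidfire.db
print(resolve_db_path())
```
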
@@ -0,0 +1,29 @@
+#!/usr/bin/env python
+import socket
+import argparse
+
+def ping_server(server: str, port: int, timeout=3):
+    """ping server:port """
+    try:
+        socket.setdefaulttimeout(timeout)
+        s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+        s.connect((server, port))
+    except OSError as _:
+        return False
+    else:
+        s.close()
+        return True
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Ping a server port")
+    parser.add_argument("server", type=str, help="Server to ping")
+    parser.add_argument("port", type=int, help="Port to ping")
+    parser.add_argument("--timeout", "-t", type=int, help="Timeout in seconds", default=3)
+    parser.add_argument("--verbose", "-v", action="store_true", help="Verbose output")
+    args = parser.parse_args()
+    ping_result = ping_server(args.server, args.port, args.timeout)
+    if args.verbose:
+        print(ping_result)
+    if ping_result:
+        exit(0)
+    exit(1)
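
Judging by the RECORD changes later in this diff, this new script ships as `rapidfireai/utils/ping.py`. Below is a hedged sketch showing the same TCP-connect check applied to the service ports named in the README's troubleshooting section (5002 mlflow, 8080 dispatcher, 3000 frontend); `port_open` is an illustrative helper, not the package's API:

```python
import socket

def port_open(host: str, port: int, timeout: float = 3.0) -> bool:
    # Same idea as ping_server above: succeed iff a TCP connection can be opened.
    try:
        with socket.create_connection((host, port), timeout=timeout):
            return True
    except OSError:
        return False

if __name__ == "__main__":
    # Ports taken from the README troubleshooting snippet in this diff.
    for name, port in [("mlflow", 5002), ("dispatcher", 8080), ("frontend", 3000)]:
        status = "up" if port_open("127.0.0.1", port) else "down"
        print(f"{name} (port {port}): {status}")
```
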
@@ -286,7 +286,7 @@ class SharedMemoryManager:
         # create model entry in registry
         if model_id not in self._registry:
             self._registry[model_id] = {SHMObjectType.CHECKPOINTS: {}}
-
+
         model_entry = self._registry[model_id]
         if SHMObjectType.CHECKPOINTS not in model_entry:
             model_entry[SHMObjectType.CHECKPOINTS] = {}
@@ -445,18 +445,22 @@ class SharedMemoryManager:
                 SHMObjectType.CHECKPOINTS: {},
             }
 
+        # copy full_model, ref_state_dict, and checkpoints from warm_started_from to model_id
         model_entry = dict(self._registry[model_id])
-        model_entry[SHMObjectType.FULL_MODEL] = copy.deepcopy(
-            dict(self._registry[warm_started_from])[SHMObjectType.FULL_MODEL]
-        )
-        model_entry[SHMObjectType.REF_STATE_DICT] = copy.deepcopy(
-            dict(self._registry[warm_started_from])[SHMObjectType.REF_STATE_DICT]
-        )
-        model_entry[SHMObjectType.CHECKPOINTS] = copy.deepcopy(
-            dict(self._registry[warm_started_from])[SHMObjectType.CHECKPOINTS]
-        )
+        if SHMObjectType.FULL_MODEL in self._registry[warm_started_from]:
+            model_entry[SHMObjectType.FULL_MODEL] = copy.deepcopy(
+                dict(self._registry[warm_started_from])[SHMObjectType.FULL_MODEL]
+            )
+        if SHMObjectType.REF_STATE_DICT in self._registry[warm_started_from]:
+            model_entry[SHMObjectType.REF_STATE_DICT] = copy.deepcopy(
+                dict(self._registry[warm_started_from])[SHMObjectType.REF_STATE_DICT]
+            )
+        if SHMObjectType.CHECKPOINTS in self._registry[warm_started_from]:
+            model_entry[SHMObjectType.CHECKPOINTS] = copy.deepcopy(
+                dict(self._registry[warm_started_from])[SHMObjectType.CHECKPOINTS]
+            )
         self._registry[model_id] = model_entry
-        self.logger.debug(f"Copied warm start checkpoint from {warm_started_from} to {model_id}")
+        self.logger.debug(f"Copied warm start checkpoint from run {warm_started_from} to run {model_id}")
 
     def list_models(self):
         """Get list of all model IDs currently in shared memory."""
rapidfireai/version.py CHANGED
@@ -2,5 +2,5 @@
 Version information for RapidFire AI
 """
 
-__version__ = "0.9.9"
-__version_info__ = (0, 9, 9)
+__version__ = "0.9.11"
+__version_info__ = (0, 9, 11)
@@ -1,11 +1,11 @@
 Metadata-Version: 2.4
 Name: rapidfireai
-Version: 0.9.9
-Summary: RapidFire AI - Machine Learning Platform
+Version: 0.9.11
+Summary: RapidFire AI: Rapid Experimentation Engine for Customizing LLMs
 Author-email: "RapidFire AI Inc." <support@rapidfire.ai>
 License: Apache-2.0
 Project-URL: Homepage, https://rapidfire.ai
-Keywords: ai,rapidfire,deep-learning,artificial-intelligence,machine-learning,mlflow,experiment-tracking
+Keywords: ai,rapidfire,rapidfireai,llm,deep-learning,artificial-intelligence,machine-learning,mlflow,experiment-tracking
 Classifier: Development Status :: 4 - Beta
 Classifier: Intended Audience :: Developers
 Classifier: Intended Audience :: Science/Research
@@ -63,17 +63,20 @@ Rapid experimentation for easier, faster, and more impactful fine-tuning and pos
 
 RapidFire AI is a new experiment execution framework that transforms your LLM customization experimentation from slow, sequential processes into rapid, intelligent workflows with hyperparallelized training, dynamic real-time experiment control, and automatic multi-GPU system orchestration.
 
-![Usage workflow of RapidFire AI](./usage.png)
+![Usage workflow of RapidFire AI](https://raw.githubusercontent.com/RapidFireAI/rapidfireai/main/usage.png)
 
 
 ## Getting Started
+
 ### Prerequisites
+
 - [NVIDIA GPU using the 7.x or 8.x Compute Capability](https://developer.nvidia.com/cuda-gpus)
 - [NVIDIA CUDA Toolkit 11.8+](https://developer.nvidia.com/cuda-toolkit-archive)
 - [Python 3.12.x](https://www.python.org/downloads/)
 - [PyTorch 2.7.1+](https://pytorch.org/get-started/previous-versions/) with corresponding forward compatible prebuilt CUDA binaries
 
 ### Installation/Starting
+
 ```bash
 virtualenv -p python3 oss_venv
 source oss_venv/bin/activate
@@ -82,18 +85,36 @@ source oss_venv/bin/activate
 pip install rapidfireai
 
 # install specific dependencies and initialize rapidfire
-rapidfire init
+# Optionally set RF_TUTORIAL_PATH environment variable to specify an
+# alternate location for copying tutorial notebooks to
+rapidfireai init
 
 # start the rapidfire server
-rapidfire start
+rapidfireai start
 
 # open up example notebook and start experiment
 ```
 
+### Running tutorial notebooks
+
+```bash
+source oss_venv/bin/activate
 
+# replace <your_token> with your Hugging Face token
+# https://huggingface.co/docs/hub/en/security-tokens
+pip install "huggingface-hub[cli]"
+hf auth login --token <your_token>
+
+# open up example notebook from ./tutorial_notebooks and start experiment
+```
 
 ### Troubleshooting
 
+For a quick system diagnostics report (Python env, relevant packages, GPU/CUDA, and key environment variables), run:
+```bash
+rapidfireai doctor
+```
+
 If you encounter port conflicts, you can kill existing processes:
 ```bash
 lsof -t -i:5002 | xargs kill -9 # mlflow
@@ -105,13 +126,14 @@ lsof -t -i:3000 | xargs kill -9 # frontend server
 
 Browse or reference the full documentation, example use case tutorials, all API details, dashboard details, and more [here](https://rapidfire-ai-oss-docs.readthedocs-hosted.com/).
 
-
 ## Key Features
 
 ### MLflow Integration
+
 Full MLflow support for experiment tracking and metrics visualization. A named RapidFire AI experiment corresponds to an MLflow experiment for comprehensive governance
 
 ### Interactive Control Operations (IC Ops)
+
 First-of-its-kind dynamic real-time control over runs in flight. Can be invoked through the dashboard:
 - Stop active runs; puts them in a dormant state
 - Resume stopped runs; makes them active again
@@ -119,11 +141,12 @@ First-of-its-kind dynamic real-time control over runs in flight. Can be invoked
 - Delete unwanted or failed runs
 
 ### Multi-GPU Support
+
 The Scheduler automatically handles multiple GPUs on the machine and divides resources across all running configs for optimal resource utilization.
 
 ### Search and AutoML Support
-Built-in procedures for searching over configuration knob combinations, including Grid Search and Random Search. Easy to integrate with AutoML procedures. Native support for some popular AutoML procedures and customized automation of IC Ops coming soon.
 
+Built-in procedures for searching over configuration knob combinations, including Grid Search and Random Search. Easy to integrate with AutoML procedures. Native support for some popular AutoML procedures and customized automation of IC Ops coming soon.
 
 ## Directory Structure
 
@@ -142,6 +165,7 @@ rapidfireai/
 ## Architecture
 
 RapidFire AI adopts a microservices-inspired loosely coupled distributed architecture with:
+
 - **Dispatcher**: Web API layer for UI communication
 - **Database**: SQLite for state persistence
 - **Controller**: Central orchestrator running in user process
@@ -150,30 +174,36 @@ RapidFire AI adopts a microservices-inspired loosely coupled distributed archite
 
 This design enables efficient resource utilization while providing a seamless user experience for AI experimentation.
 
-
 ## Components
 
 ### Dispatcher
+
 The dispatcher provides a REST API interface for the web UI. It can be run via Flask as a single app or via Gunicorn to have it load balanced. Handles interactive control features and displays the current state of the runs in the experiment.
 
 ### Database
+
 Uses SQLite for persistent storage of metadata of experiments, runs, and artifacts. The Controller also uses it to talk with Workers on scheduling state. A clean asynchronous interface for all DB operations, including experiment lifecycle management and run tracking.
 
 ### Controller
+
 Runs as part of the user’s console or Notebook process. Orchestrates the entire training lifecycle including model creation, worker management, and scheduling. The `run_fit` logic handles sample preprocessing, model creation for given knob configurations, worker initialization, and continuous monitoring of training progress across distributed workers.
 
 ### Worker
+
 Handles the actual model training and inference on the GPUs. Workers poll the Database for tasks, load dataset chunks, and execute training runs with checkpointing and progress reporting. Currently expects any given model for given batch size to fit on a single GPU.
 
 ### Experiment
+
 Manages the complete experiment lifecycle, including creation, naming conventions, and cleanup. Experiments are automatically named with unique suffixes if conflicts exist, and all experiment metadata is tracked in the Database. An experiment's running tasks are automatically cancelled when the process ends abruptly.
 
 ### Dashboard
-A fork of MLflow that enables full tracking and visualization of all experiments and runs. It features a new panel for Interactive Control Ops that can be performed on any active runs.
 
+A fork of MLflow that enables full tracking and visualization of all experiments and runs. It features a new panel for Interactive Control Ops that can be performed on any active runs.
 
 ## Developing with RapidFire AI
+
 ### Prerequisites
+
 - Python 3.x
 - Git
 - Ubuntu/Debian system (for apt package manager)
@@ -239,4 +269,3 @@ lsof -t -i:8080 | xargs kill -9 # dispatcher
 lsof -t -i:5002 | xargs kill -9 # mlflow
 lsof -t -i:3000 | xargs kill -9 # frontend
 ```
-
@@ -1,8 +1,8 @@
 rapidfireai/__init__.py,sha256=mSV8CiaJ9LwjCpMdHSBd9bM-JBijDx-lc8hGny1KEsQ,368
-rapidfireai/cli.py,sha256=A6hSRsUAzL0Uo-6JMzYpb_YClrWFyoZjyLob3OQprHw,14476
-rapidfireai/experiment.py,sha256=jrycddPjS31zSBzcRYDQh6oxJEPw5PjfMsZN5dkGc_s,6754
-rapidfireai/start.sh,sha256=SukxhvLrLrxMSisKMlL6zaOpxmVboQiCeaaPw0N0vFo,20763
-rapidfireai/version.py,sha256=BDnMWj_SU0dPtTGuxNphAptiTmkfPwACgnkuGniH-1w,97
+rapidfireai/cli.py,sha256=AFaTAhLDbN5jf3o8EWtAmtChQCec4svAihKy-RdKZfk,15226
+rapidfireai/experiment.py,sha256=YvUAzwM3o-gEYECDOmDhbB9szKthfMAw6aiAvJ91gYA,6962
+rapidfireai/start.sh,sha256=o0Mp2EMrZ3lHkTdyCfOqg4JvatN220Kq2VuZNfhFFGg,21268
+rapidfireai/version.py,sha256=nbW2oD0HrwXmHwyIIoFZX7CSAxAW5RLgcRirqDTBqvI,99
 rapidfireai/automl/__init__.py,sha256=QnzWa33i9aMp1NatoQYJFPrGZchtTUAPkgSOyyDXbSU,501
 rapidfireai/automl/base.py,sha256=pF6NQMr8DeEFm4PBbmbUbNAtP0S-yDfeUnKMqz2D9Zk,1947
 rapidfireai/automl/datatypes.py,sha256=rbocXidGekpeukKQuMSZLFK6h6h4PIo1Fvre2FWmhqU,1470
@@ -300,19 +300,26 @@ rapidfireai/ml/checkpoint_utils.py,sha256=L6xMkaFD4onWVP_TJhymYgPI0LrC_TuLgFjoCk
 rapidfireai/ml/trainer.py,sha256=5AMHgS7ZrC0x_K49TedQxQEzRBoGUk81DkJj0Csh4CI,12799
 rapidfireai/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 rapidfireai/utils/automl_utils.py,sha256=4IeGZyYRxSdoKk1dBcTI5_JRms70TyiWL9F6Gta31BI,2004
-rapidfireai/utils/constants.py,sha256=W5YT-pgk50uK5M0xW7Ze9IjqzNJQu5Psj5Kct9GuqSQ,3248
+rapidfireai/utils/constants.py,sha256=H2LpiQuJqC58I0P7_J53FmxKEkWWRByLq_-hoQNi30E,3305
 rapidfireai/utils/datapaths.py,sha256=PKgZu_qWx2z6QBIfmzmjY0lWG79GaU6W3577_34yX10,2554
 rapidfireai/utils/exceptions.py,sha256=RA6kMSV3nCz3oE-yhuNLDEneDqTUrZC6N0AkSRBdAlg,2002
 rapidfireai/utils/experiment_utils.py,sha256=7ow1RGk4dnXOKVnkjcHNSYGjLLlVgPlrvnjt_hq_0Ik,14688
 rapidfireai/utils/logging.py,sha256=X6hLKk4alVUhPqs4CdBmPj4ppSOkQ0WoyczNzCWs02E,3050
 rapidfireai/utils/mlflow_manager.py,sha256=iGuA5ubmhTjhxtZrLCsStpCHBAidnnvONb5LVWZv-RE,5046
+rapidfireai/utils/ping.py,sha256=d8d5Ykx-Tn0HRFeo3xzxwc__KMn2t9FvEd7ur9YLts8,976
 rapidfireai/utils/serialize.py,sha256=_A9egs2uhlYNGT3Ntv2fzH7rwp6I-GGVoS4ViY3sufU,401
-rapidfireai/utils/shm_manager.py,sha256=jc38D3GdP1bZyDR-wnQrlCSRW4c_Z_v9rtrZCAJ7-C4,21483
+rapidfireai/utils/shm_manager.py,sha256=OU-EEKMylW-q-oldh5KDmW770gz7yjYvhCw-_IRwquQ,21848
 rapidfireai/utils/trainer_config.py,sha256=91X4-Z8aZl7W-W6Yf-wQINeFPFIf0gvzKT6Z3mfgYXA,587
 rapidfireai/utils/worker_manager.py,sha256=LsXnXC2yDwnIp7tm1shpI6DMpif6XGtZ-4kDoo302tk,7971
-rapidfireai-0.9.9.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-rapidfireai-0.9.9.dist-info/METADATA,sha256=cgVnVMDfBDci0y_sjo7e37rjkr7I_W4lB2qTAALjU8o,9842
-rapidfireai-0.9.9.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-rapidfireai-0.9.9.dist-info/entry_points.txt,sha256=-384aiPXnSnhFE4OTDAu1DmqHL5X4tEzTIvSee0x6nc,51
-rapidfireai-0.9.9.dist-info/top_level.txt,sha256=A28FddyVhe1LHCbvbigLRtmEWKHGVgOVKH1_FfbUQ2U,12
-rapidfireai-0.9.9.dist-info/RECORD,,
+rapidfireai-0.9.11.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+tutorial_notebooks/rf-tutorial-dpo-alignment-lite.ipynb,sha256=d3cQ8o0myJC7gyAZDZti9FmCEBpQ49BPbFpGMq-U7lY,13241
+tutorial_notebooks/rf-tutorial-dpo-alignment.ipynb,sha256=3pf7tjYHrmdL1O06nsjI7V_T7LP_AH_Qcgvj5ykv6yE,13854
+tutorial_notebooks/rf-tutorial-grpo-mathreasoning-lite.ipynb,sha256=cl1oxroOLIiVv8yFWGrYqmhKgE7RIBUg7EZCgiv9XG8,11576
+tutorial_notebooks/rf-tutorial-grpo-mathreasoning.ipynb,sha256=Kdeoadw5lrTMQF9Zn42kYhldvQdnD1VLXWgI94Rq8So,12455
+tutorial_notebooks/rf-tutorial-sft-chatqa-lite.ipynb,sha256=v7ITbSqYJgDKFzXJ5Mz4PdQFNCayDFvW6y0CFgao10Y,10468
+tutorial_notebooks/rf-tutorial-sft-chatqa.ipynb,sha256=edaOoWJtQf19zZKL0DEw9QynFvgvP0842Lwsw5cDQ9E,10343
+rapidfireai-0.9.11.dist-info/METADATA,sha256=VKKhbyOtnIEbNMXT7Zz-CNKJOUj6msEXh7Za_I1EuVs,10557
+rapidfireai-0.9.11.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+rapidfireai-0.9.11.dist-info/entry_points.txt,sha256=tuZF1oC4KyQ9H767o83S8Y-ZiGvw_PVADPL1vRykY3g,53
+rapidfireai-0.9.11.dist-info/top_level.txt,sha256=A28FddyVhe1LHCbvbigLRtmEWKHGVgOVKH1_FfbUQ2U,12
+rapidfireai-0.9.11.dist-info/RECORD,,
@@ -0,0 +1,2 @@
+[console_scripts]
+rapidfireai = rapidfireai.cli:main