PyPI - rapidfireai - Versions diffs - 0.9.10__py3-none-any.whl → 0.9.11__py3-none-any.whl - Mend

rapidfireai 0.9.10__py3-none-any.whl → 0.9.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of rapidfireai might be problematic. Click here for more details.

Files changed (19) hide show

rapidfireai/cli.py +23 -3
rapidfireai/experiment.py +5 -1
rapidfireai/start.sh +152 -139
rapidfireai/utils/constants.py +1 -2
rapidfireai/utils/ping.py +29 -0
rapidfireai/utils/shm_manager.py +15 -11
rapidfireai/version.py +2 -2
{rapidfireai-0.9.10.dist-info → rapidfireai-0.9.11.dist-info}/METADATA +31 -7
{rapidfireai-0.9.10.dist-info → rapidfireai-0.9.11.dist-info}/RECORD +19 -12
tutorial_notebooks/rf-tutorial-dpo-alignment-lite.ipynb +412 -0
tutorial_notebooks/rf-tutorial-dpo-alignment.ipynb +427 -0
tutorial_notebooks/rf-tutorial-grpo-mathreasoning-lite.ipynb +358 -0
tutorial_notebooks/rf-tutorial-grpo-mathreasoning.ipynb +371 -0
tutorial_notebooks/rf-tutorial-sft-chatqa-lite.ipynb +329 -0
tutorial_notebooks/rf-tutorial-sft-chatqa.ipynb +331 -0
{rapidfireai-0.9.10.dist-info → rapidfireai-0.9.11.dist-info}/WHEEL +0 -0
{rapidfireai-0.9.10.dist-info → rapidfireai-0.9.11.dist-info}/entry_points.txt +0 -0
{rapidfireai-0.9.10.dist-info → rapidfireai-0.9.11.dist-info}/licenses/LICENSE +0 -0
{rapidfireai-0.9.10.dist-info → rapidfireai-0.9.11.dist-info}/top_level.txt +0 -0

rapidfireai/utils/constants.py CHANGED Viewed

@@ -10,7 +10,6 @@ SHM_MIN_FREE_SPACE = 1.0
 LOG_FILENAME = "rapidfire.log"
 TRAINING_LOG_FILENAME = "training.log"
 class LogType(Enum):
     """Enum class for log types"""
@@ -33,7 +32,7 @@ class DBConfig:
     # Use user's home directory for database path
     import os
-    DB_PATH: str = os.path.expanduser("~/db/rapidfire.db")
+    DB_PATH: str = os.path.join(os.getenv("RF_DB_PATH", os.path.expanduser(os.path.join("~","db"))), "rapidfire.db")
     # Connection settings
     CONNECTION_TIMEOUT: float = 30.0

rapidfireai/utils/ping.py ADDED Viewed

@@ -0,0 +1,29 @@
+#!/usr/bin/env python
+import socket
+import argparse
+def ping_server(server: str, port: int, timeout=3):
+    """ping server:port """
+    try:
+        socket.setdefaulttimeout(timeout)
+        s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+        s.connect((server, port))
+    except OSError as _:
+        return False
+    else:
+        s.close()
+        return True
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Ping a server port")
+    parser.add_argument("server", type=str, help="Server to ping")
+    parser.add_argument("port", type=int, help="Port to ping")
+    parser.add_argument("--timeout", "-t", type=int, help="Timeout in seconds", default=3)
+    parser.add_argument("--verbose", "-v", action="store_true", help="Verbose output")
+    args = parser.parse_args()
+    ping_result = ping_server(args.server, args.port, args.timeout)
+    if args.verbose:
+        print(ping_result)
+    if ping_result:
+        exit(0)
+    exit(1)

rapidfireai/utils/shm_manager.py CHANGED Viewed

@@ -286,7 +286,7 @@ class SharedMemoryManager:
             # create model entry in registry
             if model_id not in self._registry:
                 self._registry[model_id] = {SHMObjectType.CHECKPOINTS: {}}
             model_entry = self._registry[model_id]
             if SHMObjectType.CHECKPOINTS not in model_entry:
                 model_entry[SHMObjectType.CHECKPOINTS] = {}
@@ -445,18 +445,22 @@ class SharedMemoryManager:
                     SHMObjectType.CHECKPOINTS: {},
                 }
+            # copy full_model, ref_state_dict, and checkpoints from warm_started_from to model_id
             model_entry = dict(self._registry[model_id])
-            model_entry[SHMObjectType.FULL_MODEL] = copy.deepcopy(
-                dict(self._registry[warm_started_from])[SHMObjectType.FULL_MODEL]
-            )
-            model_entry[SHMObjectType.REF_STATE_DICT] = copy.deepcopy(
-                dict(self._registry[warm_started_from])[SHMObjectType.REF_STATE_DICT]
-            )
-            model_entry[SHMObjectType.CHECKPOINTS] = copy.deepcopy(
-                dict(self._registry[warm_started_from])[SHMObjectType.CHECKPOINTS]
-            )
+            if SHMObjectType.FULL_MODEL in self._registry[warm_started_from]:
+                model_entry[SHMObjectType.FULL_MODEL] = copy.deepcopy(
+                    dict(self._registry[warm_started_from])[SHMObjectType.FULL_MODEL]
+                )
+            if SHMObjectType.REF_STATE_DICT in self._registry[warm_started_from]:
+                model_entry[SHMObjectType.REF_STATE_DICT] = copy.deepcopy(
+                    dict(self._registry[warm_started_from])[SHMObjectType.REF_STATE_DICT]
+                )
+            if SHMObjectType.CHECKPOINTS in self._registry[warm_started_from]:
+                model_entry[SHMObjectType.CHECKPOINTS] = copy.deepcopy(
+                    dict(self._registry[warm_started_from])[SHMObjectType.CHECKPOINTS]
+                )
             self._registry[model_id] = model_entry
-            self.logger.debug(f"Copied warm start checkpoint from {warm_started_from} to {model_id}")
+            self.logger.debug(f"Copied warm start checkpoint from run {warm_started_from} to run {model_id}")
     def list_models(self):
         """Get list of all model IDs currently in shared memory."""

rapidfireai/version.py CHANGED Viewed

@@ -2,5 +2,5 @@
 Version information for RapidFire AI
 """
-__version__ = "0.9.10"
-__version_info__ = (0, 9, 10)
+__version__ = "0.9.11"
+__version_info__ = (0, 9, 11)

{rapidfireai-0.9.10.dist-info → rapidfireai-0.9.11.dist-info}/METADATA RENAMED Viewed

@@ -1,11 +1,11 @@
 Metadata-Version: 2.4
 Name: rapidfireai
-Version: 0.9.10
+Version: 0.9.11
 Summary: RapidFire AI: Rapid Experimentation Engine for Customizing LLMs
 Author-email: "RapidFire AI Inc." <support@rapidfire.ai>
 License: Apache-2.0
 Project-URL: Homepage, https://rapidfire.ai
-Keywords: ai,rapidfire,rapidfireai,deep-learning,artificial-intelligence,machine-learning,mlflow,experiment-tracking
+Keywords: ai,rapidfire,rapidfireai,llm,deep-learning,artificial-intelligence,machine-learning,mlflow,experiment-tracking
 Classifier: Development Status :: 4 - Beta
 Classifier: Intended Audience :: Developers
 Classifier: Intended Audience :: Science/Research
@@ -67,13 +67,16 @@ RapidFire AI is a new experiment execution framework that transforms your LLM cu
 ## Getting Started
 ### Prerequisites
 - [NVIDIA GPU using the 7.x or 8.x Compute Capability](https://developer.nvidia.com/cuda-gpus)
 - [NVIDIA CUDA Toolkit 11.8+](https://developer.nvidia.com/cuda-toolkit-archive)
 - [Python 3.12.x](https://www.python.org/downloads/)
 - [PyTorch 2.7.1+](https://pytorch.org/get-started/previous-versions/) with corresponding forward compatible prebuilt CUDA binaries
 ### Installation/Starting
 ```bash
 virtualenv -p python3 oss_venv
 source oss_venv/bin/activate
@@ -82,6 +85,8 @@ source oss_venv/bin/activate
 pip install rapidfireai
 # install specific dependencies and initialize rapidfire
+# Optionally set the RF_TUTORIAL_PATH environment variable to specify
+# an alternate location to copy the tutorial notebooks to
 rapidfireai init
 # start the rapidfire server
@@ -90,7 +95,18 @@ rapidfireai start
 # open up example notebook and start experiment
 ```
+### Running tutorial notebooks
+```bash
+source oss_venv/bin/activate
+# replace <your_token> with your Hugging Face token; see
+# https://huggingface.co/docs/hub/en/security-tokens
+pip install "huggingface-hub[cli]"
+hf auth login --token <your_token>
+# open up example notebook from ./tutorial_notebooks and start experiment
+```
 ### Troubleshooting
@@ -110,13 +126,14 @@ lsof -t -i:3000 | xargs kill -9  # frontend server
 Browse or reference the full documentation, example use case tutorials, all API details, dashboard details, and more [here](https://rapidfire-ai-oss-docs.readthedocs-hosted.com/).
 ## Key Features
 ### MLflow Integration
 Full MLflow support for experiment tracking and metrics visualization. A named RapidFire AI experiment corresponds to an MLflow experiment for comprehensive governance.
 ### Interactive Control Operations (IC Ops)
 First-of-its-kind dynamic real-time control over runs in flight. Can be invoked through the dashboard:
 - Stop active runs; puts them in a dormant state
 - Resume stopped runs; makes them active again
@@ -124,11 +141,12 @@ First-of-its-kind dynamic real-time control over runs in flight. Can be invoked
 - Delete unwanted or failed runs
 ### Multi-GPU Support
 The Scheduler automatically handles multiple GPUs on the machine and divides resources across all running configs for optimal resource utilization.
 ### Search and AutoML Support
-Built-in procedures for searching over configuration knob combinations, including Grid Search and Random Search. Easy to integrate with AutoML procedures. Native support for some popular AutoML procedures and customized automation of IC Ops coming soon.
+Built-in procedures for searching over configuration knob combinations, including Grid Search and Random Search. Easy to integrate with AutoML procedures. Native support for some popular AutoML procedures and customized automation of IC Ops coming soon.
 ## Directory Structure
@@ -147,6 +165,7 @@ rapidfireai/
 ## Architecture
 RapidFire AI adopts a microservices-inspired loosely coupled distributed architecture with:
 - **Dispatcher**: Web API layer for UI communication
 - **Database**: SQLite for state persistence
 - **Controller**: Central orchestrator running in user process
@@ -155,30 +174,36 @@ RapidFire AI adopts a microservices-inspired loosely coupled distributed archite
 This design enables efficient resource utilization while providing a seamless user experience for AI experimentation.
 ## Components
 ### Dispatcher
 The dispatcher provides a REST API interface for the web UI. It can be run via Flask as a single app or via Gunicorn to have it load balanced. Handles interactive control features and displays the current state of the runs in the experiment.
 ### Database
 Uses SQLite for persistent storage of metadata of experiments, runs, and artifacts. The Controller also uses it to talk with Workers on scheduling state. A clean asynchronous interface for all DB operations, including experiment lifecycle management and run tracking.
 ### Controller
 Runs as part of the user’s console or Notebook process. Orchestrates the entire training lifecycle including model creation, worker management, and scheduling. The `run_fit` logic handles sample preprocessing, model creation for given knob configurations, worker initialization, and continuous monitoring of training progress across distributed workers.
 ### Worker
 Handles the actual model training and inference on the GPUs. Workers poll the Database for tasks, load dataset chunks, and execute training runs with checkpointing and progress reporting. Currently expects any given model for given batch size to fit on a single GPU.
 ### Experiment
 Manages the complete experiment lifecycle, including creation, naming conventions, and cleanup. Experiments are automatically named with unique suffixes if conflicts exist, and all experiment metadata is tracked in the Database. An experiment's running tasks are automatically cancelled when the process ends abruptly.
 ### Dashboard
-A fork of MLflow that enables full tracking and visualization of all experiments and runs. It features a new panel for Interactive Control Ops that can be performed on any active runs.
+A fork of MLflow that enables full tracking and visualization of all experiments and runs. It features a new panel for Interactive Control Ops that can be performed on any active runs.
 ## Developing with RapidFire AI
 ### Prerequisites
 - Python 3.x
 - Git
 - Ubuntu/Debian system (for apt package manager)
@@ -244,4 +269,3 @@ lsof -t -i:8080 | xargs kill -9 # dispatcher
 lsof -t -i:5002 | xargs kill -9 # mlflow
 lsof -t -i:3000 | xargs kill -9 # frontend
 ```

{rapidfireai-0.9.10.dist-info → rapidfireai-0.9.11.dist-info}/RECORD RENAMED Viewed

@@ -1,8 +1,8 @@
 rapidfireai/__init__.py,sha256=mSV8CiaJ9LwjCpMdHSBd9bM-JBijDx-lc8hGny1KEsQ,368
-rapidfireai/cli.py,sha256=MhqpAFkVkqyOc197MnSZaeDlaoN0x4XL6kuYWr3bDBY,14474
-rapidfireai/experiment.py,sha256=jrycddPjS31zSBzcRYDQh6oxJEPw5PjfMsZN5dkGc_s,6754
-rapidfireai/start.sh,sha256=qskOeWeVh9mkLmnLMpWTzwchqijUAesbB2qEzYRtrtU,20846
-rapidfireai/version.py,sha256=4Kvc3fe_9Zyh_tRvg6el6kkBdISgR3ALk1FFO1867ds,99
+rapidfireai/cli.py,sha256=AFaTAhLDbN5jf3o8EWtAmtChQCec4svAihKy-RdKZfk,15226
+rapidfireai/experiment.py,sha256=YvUAzwM3o-gEYECDOmDhbB9szKthfMAw6aiAvJ91gYA,6962
+rapidfireai/start.sh,sha256=o0Mp2EMrZ3lHkTdyCfOqg4JvatN220Kq2VuZNfhFFGg,21268
+rapidfireai/version.py,sha256=nbW2oD0HrwXmHwyIIoFZX7CSAxAW5RLgcRirqDTBqvI,99
 rapidfireai/automl/__init__.py,sha256=QnzWa33i9aMp1NatoQYJFPrGZchtTUAPkgSOyyDXbSU,501
 rapidfireai/automl/base.py,sha256=pF6NQMr8DeEFm4PBbmbUbNAtP0S-yDfeUnKMqz2D9Zk,1947
 rapidfireai/automl/datatypes.py,sha256=rbocXidGekpeukKQuMSZLFK6h6h4PIo1Fvre2FWmhqU,1470
@@ -300,19 +300,26 @@ rapidfireai/ml/checkpoint_utils.py,sha256=L6xMkaFD4onWVP_TJhymYgPI0LrC_TuLgFjoCk
 rapidfireai/ml/trainer.py,sha256=5AMHgS7ZrC0x_K49TedQxQEzRBoGUk81DkJj0Csh4CI,12799
 rapidfireai/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 rapidfireai/utils/automl_utils.py,sha256=4IeGZyYRxSdoKk1dBcTI5_JRms70TyiWL9F6Gta31BI,2004
-rapidfireai/utils/constants.py,sha256=W5YT-pgk50uK5M0xW7Ze9IjqzNJQu5Psj5Kct9GuqSQ,3248
+rapidfireai/utils/constants.py,sha256=H2LpiQuJqC58I0P7_J53FmxKEkWWRByLq_-hoQNi30E,3305
 rapidfireai/utils/datapaths.py,sha256=PKgZu_qWx2z6QBIfmzmjY0lWG79GaU6W3577_34yX10,2554
 rapidfireai/utils/exceptions.py,sha256=RA6kMSV3nCz3oE-yhuNLDEneDqTUrZC6N0AkSRBdAlg,2002
 rapidfireai/utils/experiment_utils.py,sha256=7ow1RGk4dnXOKVnkjcHNSYGjLLlVgPlrvnjt_hq_0Ik,14688
 rapidfireai/utils/logging.py,sha256=X6hLKk4alVUhPqs4CdBmPj4ppSOkQ0WoyczNzCWs02E,3050
 rapidfireai/utils/mlflow_manager.py,sha256=iGuA5ubmhTjhxtZrLCsStpCHBAidnnvONb5LVWZv-RE,5046
+rapidfireai/utils/ping.py,sha256=d8d5Ykx-Tn0HRFeo3xzxwc__KMn2t9FvEd7ur9YLts8,976
 rapidfireai/utils/serialize.py,sha256=_A9egs2uhlYNGT3Ntv2fzH7rwp6I-GGVoS4ViY3sufU,401
-rapidfireai/utils/shm_manager.py,sha256=jc38D3GdP1bZyDR-wnQrlCSRW4c_Z_v9rtrZCAJ7-C4,21483
+rapidfireai/utils/shm_manager.py,sha256=OU-EEKMylW-q-oldh5KDmW770gz7yjYvhCw-_IRwquQ,21848
 rapidfireai/utils/trainer_config.py,sha256=91X4-Z8aZl7W-W6Yf-wQINeFPFIf0gvzKT6Z3mfgYXA,587
 rapidfireai/utils/worker_manager.py,sha256=LsXnXC2yDwnIp7tm1shpI6DMpif6XGtZ-4kDoo302tk,7971
-rapidfireai-0.9.10.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-rapidfireai-0.9.10.dist-info/METADATA,sha256=vqE-GQSz4KIv7o8eGx3_E-zSk5CGrvcZgfhlgRRcpMM,10092
-rapidfireai-0.9.10.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-rapidfireai-0.9.10.dist-info/entry_points.txt,sha256=tuZF1oC4KyQ9H767o83S8Y-ZiGvw_PVADPL1vRykY3g,53
-rapidfireai-0.9.10.dist-info/top_level.txt,sha256=A28FddyVhe1LHCbvbigLRtmEWKHGVgOVKH1_FfbUQ2U,12
-rapidfireai-0.9.10.dist-info/RECORD,,
+rapidfireai-0.9.11.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+tutorial_notebooks/rf-tutorial-dpo-alignment-lite.ipynb,sha256=d3cQ8o0myJC7gyAZDZti9FmCEBpQ49BPbFpGMq-U7lY,13241
+tutorial_notebooks/rf-tutorial-dpo-alignment.ipynb,sha256=3pf7tjYHrmdL1O06nsjI7V_T7LP_AH_Qcgvj5ykv6yE,13854
+tutorial_notebooks/rf-tutorial-grpo-mathreasoning-lite.ipynb,sha256=cl1oxroOLIiVv8yFWGrYqmhKgE7RIBUg7EZCgiv9XG8,11576
+tutorial_notebooks/rf-tutorial-grpo-mathreasoning.ipynb,sha256=Kdeoadw5lrTMQF9Zn42kYhldvQdnD1VLXWgI94Rq8So,12455
+tutorial_notebooks/rf-tutorial-sft-chatqa-lite.ipynb,sha256=v7ITbSqYJgDKFzXJ5Mz4PdQFNCayDFvW6y0CFgao10Y,10468
+tutorial_notebooks/rf-tutorial-sft-chatqa.ipynb,sha256=edaOoWJtQf19zZKL0DEw9QynFvgvP0842Lwsw5cDQ9E,10343
+rapidfireai-0.9.11.dist-info/METADATA,sha256=VKKhbyOtnIEbNMXT7Zz-CNKJOUj6msEXh7Za_I1EuVs,10557
+rapidfireai-0.9.11.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+rapidfireai-0.9.11.dist-info/entry_points.txt,sha256=tuZF1oC4KyQ9H767o83S8Y-ZiGvw_PVADPL1vRykY3g,53
+rapidfireai-0.9.11.dist-info/top_level.txt,sha256=A28FddyVhe1LHCbvbigLRtmEWKHGVgOVKH1_FfbUQ2U,12
+rapidfireai-0.9.11.dist-info/RECORD,,

rapidfireai 0.9.10__py3-none-any.whl → 0.9.11__py3-none-any.whl

Potentially problematic release.

rapidfireai 0.9.10__py3-none-any.whl → 0.9.11__py3-none-any.whl