PyPI - rapidfireai - Versions diffs - 0.10.2rc4__py3-none-any.whl → 0.10.3rc1__py3-none-any.whl - Mend

rapidfireai 0.10.2rc4__py3-none-any.whl → 0.10.3rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of rapidfireai might be problematic. Click here for more details.

Files changed (37) hide show

rapidfireai/backend/controller.py +29 -16
rapidfireai/backend/worker.py +14 -7
rapidfireai/cli.py +28 -1
rapidfireai/db/rf_db.py +1 -1
rapidfireai/db/tables.sql +1 -1
rapidfireai/dispatcher/dispatcher.py +3 -1
rapidfireai/dispatcher/gunicorn.conf.py +1 -1
rapidfireai/experiment.py +75 -7
rapidfireai/frontend/build/asset-manifest.json +3 -3
rapidfireai/frontend/build/index.html +1 -1
rapidfireai/frontend/build/static/js/{main.3ff1e37d.js → main.e7d3b759.js} +3 -3
rapidfireai/frontend/build/static/js/{main.3ff1e37d.js.map → main.e7d3b759.js.map} +1 -1
rapidfireai/frontend/proxy_middleware.py +1 -1
rapidfireai/ml/callbacks.py +78 -38
rapidfireai/ml/trainer.py +6 -6
rapidfireai/start.sh +117 -34
rapidfireai/utils/constants.py +20 -1
rapidfireai/utils/experiment_utils.py +87 -43
rapidfireai/utils/interactive_controller.py +494 -0
rapidfireai/utils/metric_logger.py +346 -0
rapidfireai/utils/mlflow_manager.py +0 -2
rapidfireai/utils/worker_manager.py +16 -6
rapidfireai/version.py +2 -2
{rapidfireai-0.10.2rc4.dist-info → rapidfireai-0.10.3rc1.dist-info}/METADATA +7 -4
{rapidfireai-0.10.2rc4.dist-info → rapidfireai-0.10.3rc1.dist-info}/RECORD +37 -34
tutorial_notebooks/rf-colab-tensorboard-tutorial.ipynb +314 -0
tutorial_notebooks/rf-tutorial-dpo-alignment-lite.ipynb +6 -6
tutorial_notebooks/rf-tutorial-dpo-alignment.ipynb +6 -6
tutorial_notebooks/rf-tutorial-grpo-mathreasoning-lite.ipynb +6 -6
tutorial_notebooks/rf-tutorial-grpo-mathreasoning.ipynb +6 -6
tutorial_notebooks/rf-tutorial-sft-chatqa-lite.ipynb +6 -6
tutorial_notebooks/rf-tutorial-sft-chatqa.ipynb +6 -6
/rapidfireai/frontend/build/static/js/{main.3ff1e37d.js.LICENSE.txt → main.e7d3b759.js.LICENSE.txt} +0 -0
{rapidfireai-0.10.2rc4.dist-info → rapidfireai-0.10.3rc1.dist-info}/WHEEL +0 -0
{rapidfireai-0.10.2rc4.dist-info → rapidfireai-0.10.3rc1.dist-info}/entry_points.txt +0 -0
{rapidfireai-0.10.2rc4.dist-info → rapidfireai-0.10.3rc1.dist-info}/licenses/LICENSE +0 -0
{rapidfireai-0.10.2rc4.dist-info → rapidfireai-0.10.3rc1.dist-info}/top_level.txt +0 -0

rapidfireai/utils/experiment_utils.py CHANGED Viewed

@@ -8,7 +8,6 @@ import sys
 import warnings
 from typing import Any
-import mlflow
 import pandas as pd
 import torch
 from IPython.display import display
@@ -16,11 +15,13 @@ from tqdm import tqdm
 from transformers import logging as transformers_logging
 from rapidfireai.db.rf_db import RfDb
-from rapidfireai.utils.constants import MLFLOW_URL, ExperimentStatus, ExperimentTask
+from rapidfireai.utils.constants import MLFLOW_URL, ExperimentStatus, ExperimentTask, get_tracking_backend
 from rapidfireai.utils.datapaths import DataPath
 from rapidfireai.utils.exceptions import DBException, ExperimentException
 from rapidfireai.utils.logging import RFLogger
-from rapidfireai.utils.mlflow_manager import MLflowManager
+# Note: mlflow and MLflowManager are imported lazily inside conditional blocks
+# to avoid MLflow connection attempts when using tensorboard-only mode
 class ExperimentUtils:
@@ -82,12 +83,16 @@ class ExperimentUtils:
         self._disable_ml_warnings_display()
         # Clear any existing MLflow context before starting new experiment
-        try:
-            if mlflow.active_run():
-                print("Clearing existing MLflow context before starting new experiment")
-                mlflow.end_run()
-        except Exception as e:
-            print(f"Error clearing existing MLflow context: {e}")
+        # Only if using MLflow backend
+        tracking_backend = get_tracking_backend()
+        if tracking_backend in ["mlflow", "both"]:
+            import mlflow  # Lazy import to avoid connection attempts in tensorboard-only mode
+            try:
+                if mlflow.active_run():
+                    print("Clearing existing MLflow context before starting new experiment")
+                    mlflow.end_run()
+            except Exception as e:
+                print(f"Error clearing existing MLflow context: {e}")
         # check if experiment is already running
         running_experiment = None
@@ -124,11 +129,18 @@ class ExperimentUtils:
                     given_name,
                     experiments_path,
                 )
-                msg = (
-                    f"The previously running experiment {running_experiment['experiment_name']} was forcibly ended."
-                    f" Created a new experiment with name '{experiment_name}' with Experiment ID: {experiment_id}"
-                    f" and MLFlow Experiment ID: {mlflow_experiment_id} saved at {experiments_path}/{experiment_name}"
-                )
+                if mlflow_experiment_id:
+                    msg = (
+                        f"The previously running experiment {running_experiment['experiment_name']} was forcibly ended."
+                        f" Created a new experiment '{experiment_name}' with Experiment ID: {experiment_id}"
+                        f" and MLflow Experiment ID: {mlflow_experiment_id} at {experiments_path}/{experiment_name}"
+                    )
+                else:
+                    msg = (
+                        f"The previously running experiment {running_experiment['experiment_name']} was forcibly ended."
+                        f" Created a new experiment '{experiment_name}' with Experiment ID: {experiment_id}"
+                        f" at {experiments_path}/{experiment_name} (TensorBoard-only mode)"
+                    )
                 print(msg)
                 log_messages.append(msg)
         # check if experiment name already exists
@@ -137,11 +149,18 @@ class ExperimentUtils:
                 given_name,
                 experiments_path,
             )
-            msg = (
-                "An experiment with the same name already exists."
-                f" Created a new experiment with name '{experiment_name}' with Experiment ID: {experiment_id}"
-                f" and MLFlow Experiment ID: {mlflow_experiment_id} saved at {experiments_path}/{experiment_name}"
-            )
+            if mlflow_experiment_id:
+                msg = (
+                    "An experiment with the same name already exists."
+                    f" Created a new experiment '{experiment_name}' with Experiment ID: {experiment_id}"
+                    f" and MLflow Experiment ID: {mlflow_experiment_id} at {experiments_path}/{experiment_name}"
+                )
+            else:
+                msg = (
+                    "An experiment with the same name already exists."
+                    f" Created a new experiment '{experiment_name}' with Experiment ID: {experiment_id}"
+                    f" at {experiments_path}/{experiment_name} (TensorBoard-only mode)"
+                )
             print(msg)
             log_messages.append(msg)
         else:
@@ -149,10 +168,16 @@ class ExperimentUtils:
                 given_name,
                 experiments_path,
             )
-            msg = (
-                f"Experiment {experiment_name} created with Experiment ID: {experiment_id}"
-                f" and MLFlow Experiment ID: {mlflow_experiment_id} saved at {experiments_path}/{experiment_name}"
-            )
+            if mlflow_experiment_id:
+                msg = (
+                    f"Experiment {experiment_name} created with Experiment ID: {experiment_id}"
+                    f" and MLflow Experiment ID: {mlflow_experiment_id} at {experiments_path}/{experiment_name}"
+                )
+            else:
+                msg = (
+                    f"Experiment {experiment_name} created with Experiment ID: {experiment_id}"
+                    f" at {experiments_path}/{experiment_name} (TensorBoard-only mode)"
+                )
             print(msg)
             log_messages.append(msg)
@@ -185,20 +210,24 @@ class ExperimentUtils:
         self.db.set_experiment_status(current_experiment["experiment_id"], ExperimentStatus.COMPLETED)
         self.db.reset_all_tables()
-        # Clear MLflow context
-        try:
-            if mlflow.active_run():
-                print("Ending active MLflow run before ending experiment")
-                mlflow.end_run()
-            # Also clear context through MLflowManager if available
+        # Clear MLflow context only if using MLflow backend
+        tracking_backend = get_tracking_backend()
+        if tracking_backend in ["mlflow", "both"]:
+            import mlflow  # Lazy import to avoid connection attempts in tensorboard-only mode
+            from rapidfireai.utils.mlflow_manager import MLflowManager
             try:
-                mlflow_manager = MLflowManager(MLFLOW_URL)
-                mlflow_manager.clear_context()
-            except Exception as e2:
-                print(f"[Error clearing MLflow context through MLflowManager: {e2}")
-        except Exception as e:
-            print(f"Error clearing MLflow context: {e}")
+                if mlflow.active_run():
+                    print("Ending active MLflow run before ending experiment")
+                    mlflow.end_run()
+                # Also clear context through MLflowManager if available
+                try:
+                    mlflow_manager = MLflowManager(MLFLOW_URL)
+                    mlflow_manager.clear_context()
+                except Exception as e2:
+                    print(f"Error clearing MLflow context through MLflowManager: {e2}")
+            except Exception as e:
+                print(f"Error clearing MLflow context: {e}")
         # print experiment ended message
         msg = f"Experiment {experiment_name} ended"
@@ -311,28 +340,43 @@ class ExperimentUtils:
             print(f"Error displaying runs info: {e}")
             raise
-    def _create_experiment_internal(self, given_name: str, experiments_path: str) -> tuple[int, str, str]:
+    def _create_experiment_internal(self, given_name: str, experiments_path: str) -> tuple[int, str, str | None]:
         """Create new experiment -
         if given_name already exists - increment suffix and create new experiment
         if given_name is new - create new experiment with given name
+        Returns: experiment_id, experiment_name, mlflow_experiment_id (or None if tensorboard-only)
         """
         try:
             given_name = given_name if given_name else "rf-exp"
             experiment_name = self._generate_unique_experiment_name(given_name, self.db.get_all_experiment_names())
-            mlflow_manager = MLflowManager(MLFLOW_URL)
-            mlflow_experiment_id = mlflow_manager.create_experiment(experiment_name)
-            mlflow.tracing.disable_notebook_display()
+            # Create MLflow experiment only if using MLflow backend
+            mlflow_experiment_id = None
+            tracking_backend = get_tracking_backend()
+            if tracking_backend in ["mlflow", "both"]:
+                import mlflow  # Lazy import to avoid connection attempts in tensorboard-only mode
+                from rapidfireai.utils.mlflow_manager import MLflowManager
+                try:
+                    mlflow_manager = MLflowManager(MLFLOW_URL)
+                    mlflow_experiment_id = mlflow_manager.create_experiment(experiment_name)
+                    mlflow.tracing.disable_notebook_display()
+                except Exception as e:
+                    # Catch MLflow-specific exceptions (mlflow.exceptions.RestException, etc.)
+                    raise ExperimentException(f"Error creating MLFlow experiment: {e}") from e
             # write new experiment details to database
             experiment_id = self.db.create_experiment(
                 experiment_name,
-                mlflow_experiment_id,
+                mlflow_experiment_id,  # Will be None for tensorboard-only
                 config_options={"experiments_path": experiments_path},
             )
             return experiment_id, experiment_name, mlflow_experiment_id
-        except mlflow.exceptions.RestException as e:  # pyright: ignore
-            raise ExperimentException(f"Error creating MLFlow experiment: {e}") from e
+        except ExperimentException:
+            # Re-raise ExperimentExceptions (including MLflow errors from above)
+            raise
+        except Exception as e:
+            # Catch any other unexpected errors
+            raise ExperimentException(f"Error in _create_experiment_internal: {e}") from e
     def _generate_unique_experiment_name(self, name: str, existing_names: list[str]) -> str:
         """Increment the suffix of the name after the last '_' till it is unique"""

rapidfireai 0.10.2rc4__py3-none-any.whl → 0.10.3rc1__py3-none-any.whl

Potentially problematic release.

rapidfireai 0.10.2rc4__py3-none-any.whl → 0.10.3rc1__py3-none-any.whl