rapidfireai 0.10.2rc5__py3-none-any.whl → 0.11.1rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of rapidfireai might be problematic (see the registry page for details).
- rapidfireai/automl/grid_search.py +4 -5
- rapidfireai/automl/model_config.py +41 -37
- rapidfireai/automl/random_search.py +21 -33
- rapidfireai/backend/controller.py +80 -161
- rapidfireai/backend/worker.py +26 -8
- rapidfireai/cli.py +171 -132
- rapidfireai/db/rf_db.py +1 -1
- rapidfireai/db/tables.sql +1 -1
- rapidfireai/dispatcher/dispatcher.py +3 -1
- rapidfireai/dispatcher/gunicorn.conf.py +1 -1
- rapidfireai/experiment.py +86 -7
- rapidfireai/frontend/build/asset-manifest.json +3 -3
- rapidfireai/frontend/build/index.html +1 -1
- rapidfireai/frontend/build/static/js/{main.1bf27639.js → main.58393d31.js} +3 -3
- rapidfireai/frontend/build/static/js/{main.1bf27639.js.map → main.58393d31.js.map} +1 -1
- rapidfireai/frontend/proxy_middleware.py +1 -1
- rapidfireai/ml/callbacks.py +85 -59
- rapidfireai/ml/trainer.py +42 -86
- rapidfireai/start.sh +117 -34
- rapidfireai/utils/constants.py +22 -1
- rapidfireai/utils/experiment_utils.py +87 -43
- rapidfireai/utils/interactive_controller.py +473 -0
- rapidfireai/utils/logging.py +1 -2
- rapidfireai/utils/metric_logger.py +346 -0
- rapidfireai/utils/mlflow_manager.py +0 -1
- rapidfireai/utils/ping.py +4 -2
- rapidfireai/utils/worker_manager.py +16 -6
- rapidfireai/version.py +2 -2
- {rapidfireai-0.10.2rc5.dist-info → rapidfireai-0.11.1rc1.dist-info}/METADATA +7 -4
- {rapidfireai-0.10.2rc5.dist-info → rapidfireai-0.11.1rc1.dist-info}/RECORD +36 -33
- tutorial_notebooks/rf-colab-tensorboard-tutorial.ipynb +314 -0
- /rapidfireai/frontend/build/static/js/{main.1bf27639.js.LICENSE.txt → main.58393d31.js.LICENSE.txt} +0 -0
- {rapidfireai-0.10.2rc5.dist-info → rapidfireai-0.11.1rc1.dist-info}/WHEEL +0 -0
- {rapidfireai-0.10.2rc5.dist-info → rapidfireai-0.11.1rc1.dist-info}/entry_points.txt +0 -0
- {rapidfireai-0.10.2rc5.dist-info → rapidfireai-0.11.1rc1.dist-info}/licenses/LICENSE +0 -0
- {rapidfireai-0.10.2rc5.dist-info → rapidfireai-0.11.1rc1.dist-info}/top_level.txt +0 -0
rapidfireai/start.sh
CHANGED
@@ -12,11 +12,15 @@ RF_MLFLOW_HOST=${RF_MLFLOW_HOST:=127.0.0.1}
 RF_FRONTEND_PORT=${RF_FRONTEND_PORT:=3000}
 RF_FRONTEND_HOST=${RF_FRONTEND_HOST:=0.0.0.0}
 # API server configuration - these should match DispatcherConfig in constants.py
-RF_API_PORT=${RF_API_PORT:=
+RF_API_PORT=${RF_API_PORT:=8081}
 RF_API_HOST=${RF_API_HOST:=127.0.0.1}
 
 RF_DB_PATH="${RF_DB_PATH:=$HOME/db}"
 
+# Colab mode configuration
+RF_COLAB_MODE=${RF_COLAB_MODE:=false}
+RF_TRACKING_BACKEND=${RF_TRACKING_BACKEND:=mlflow}
+
 # Colors for output
 RED='\033[0;31m'
 GREEN='\033[0;32m'
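The two new variables above make Colab-specific behavior opt-in. As a rough illustration (a hypothetical notebook cell; how start.sh is actually launched from Colab is not shown in this diff, and the services would need to inherit the environment), a user would export them before the services come up:

    import os

    # Hypothetical Colab setup: flag Colab mode and choose TensorBoard-only
    # tracking before the rapidfireai services start. The variable names and
    # accepted values come from the start.sh defaults above.
    os.environ["RF_COLAB_MODE"] = "true"
    os.environ["RF_TRACKING_BACKEND"] = "tensorboard"  # 'mlflow', 'tensorboard', or 'both'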
@@ -124,20 +128,15 @@ cleanup() {
         rm -f "$RF_PID_FILE"
     fi
 
-    # Final cleanup -
-… (8 lines truncated in the diff view)
-        if [[ -n "$remaining_pids" ]]; then
-            print_status "Force killing remaining processes on port $port"
-            echo "$remaining_pids" | xargs kill -9 2>/dev/null || true
-        fi
-    done
+    # Final cleanup - ONLY if NOT in Colab mode
+    # Colab mode skips this to avoid killing Jupyter/IPython infrastructure
+    if [[ "$RF_COLAB_MODE" != "true" ]]; then
+        # Safe, specific patterns for non-Colab environments
+        pkill -f "mlflow server" 2>/dev/null || true
+        pkill -f "gunicorn.*rapidfireai.dispatcher" 2>/dev/null || true
+        # Only kill Flask server if we're not in Colab (frontend doesn't run in Colab)
+        pkill -f "python.*rapidfireai/frontend/server.py" 2>/dev/null || true
+    fi
 
     print_success "All services stopped"
     exit 0
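The rewritten cleanup above replaces the old port-sweeping force-kill loop with targeted pkill -f patterns, guarded so that a Colab kernel's Jupyter/IPython processes are never touched. A quick dry run with pgrep (not part of start.sh) shows what each pattern would match before anything gets killed:

    # pgrep -a prints PID plus full command line; -f matches against the full
    # command line, exactly as pkill -f does. Nothing is killed here.
    pgrep -af "mlflow server"
    pgrep -af "gunicorn.*rapidfireai.dispatcher"
    pgrep -af "python.*rapidfireai/frontend/server.py"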
@@ -280,6 +279,10 @@ start_mlflow() {
            grep -A 5 -B 2 "Error\|Exception\|Traceback\|Failed\|ImportError\|ModuleNotFoundError" "$SCRIPT_DIR/mlflow.log" | head -20
         fi
     else
+        if [[ "$RF_COLAB_MODE" == "true" ]] && [[ "$RF_TRACKING_BACKEND" == "tensorboard" ]]; then
+            print_status "⊗ Skipping MLflow (using TensorBoard-only tracking in Colab mode)"
+            return 0
+        fi
         print_error "No mlflow.log file found"
     fi
 
@@ -293,6 +296,19 @@ start_mlflow() {
     fi
 }
 
+# Function to conditionally start MLflow based on mode
+start_mlflow_if_needed() {
+    # In Colab mode with pure TensorBoard, skip MLflow
+    if [[ "$RF_COLAB_MODE" == "true" ]] && [[ "$RF_TRACKING_BACKEND" == "tensorboard" ]]; then
+        print_status "⊗ Skipping MLflow (using TensorBoard-only tracking in Colab mode)"
+        return 0
+    fi
+
+    # Otherwise start MLflow
+    start_mlflow
+    return $?
+}
+
 # Function to start API server
 start_api_server() {
     print_status "Starting API server with Gunicorn..."
@@ -481,6 +497,19 @@ start_frontend() {
     return 0
 }
 
+# Function to conditionally start frontend based on mode
+start_frontend_if_needed() {
+    # In Colab mode, always skip frontend
+    if [[ "$RF_COLAB_MODE" == "true" ]]; then
+        print_status "⊗ Skipping frontend (using TensorBoard in Colab mode)"
+        return 0
+    fi
+
+    # Otherwise start frontend
+    start_frontend
+    return $?
+}
+
 # Function to display running services
 show_status() {
     print_status "RapidFire AI Services Status:"
@@ -499,47 +528,101 @@ show_status() {
     fi
 
     echo ""
-… (3 lines truncated in the diff view)
+
+    # Display appropriate message based on mode
+    if [[ "$RF_COLAB_MODE" == "true" ]]; then
+        print_success "🚀 RapidFire running in Colab mode!"
+        print_status "📊 Use TensorBoard for metrics visualization:"
+        print_status "   In a Colab notebook cell, run:"
+        print_status "   %tensorboard --logdir ~/experiments/{experiment_name}/tensorboard_logs"
+        if [[ "$RF_TRACKING_BACKEND" == "mlflow" ]] || [[ "$RF_TRACKING_BACKEND" == "both" ]]; then
+            print_status ""
+            print_status "📈 MLflow UI available at: http://$RF_MLFLOW_HOST:$RF_MLFLOW_PORT"
+        fi
+    else
+        print_success "🚀 RapidFire Frontend is ready!"
+        print_status "👉 Open your browser and navigate to: http://$RF_FRONTEND_HOST:$RF_FRONTEND_PORT"
+        print_status "   (Click the link above or copy/paste the URL into your browser)"
+    fi
 
     # Show log file status
     echo ""
     print_status "Log files:"
-… (4 lines truncated in the diff view)
+
+    # Always check api.log
+    if [[ -f "$SCRIPT_DIR/api.log" ]]; then
+        local size=$(du -h "$SCRIPT_DIR/api.log" | cut -f1)
+        print_status "- api.log: $size"
+    else
+        print_warning "- api.log: not found"
+    fi
+
+    # Only check mlflow.log if MLflow is running
+    if [[ "$RF_COLAB_MODE" != "true" ]] || [[ "$RF_TRACKING_BACKEND" != "tensorboard" ]]; then
+        if [[ -f "$SCRIPT_DIR/mlflow.log" ]]; then
+            local size=$(du -h "$SCRIPT_DIR/mlflow.log" | cut -f1)
+            print_status "- mlflow.log: $size"
         else
-            print_warning "-
+            print_warning "- mlflow.log: not found"
         fi
-…
+    fi
+
+    # Only check frontend.log if frontend is running
+    if [[ "$RF_COLAB_MODE" != "true" ]]; then
+        if [[ -f "$SCRIPT_DIR/frontend.log" ]]; then
+            local size=$(du -h "$SCRIPT_DIR/frontend.log" | cut -f1)
+            print_status "- frontend.log: $size"
+        else
+            print_warning "- frontend.log: not found"
+        fi
+    fi
 }
 
 # Function to start services based on mode
 start_services() {
     local services_started=0
-    local total_services=
+    local total_services=1 # API server always runs
 
-    #
-… (2 lines truncated in the diff view)
+    # Calculate total services based on mode
+    # MLflow runs unless tensorboard-only in Colab
+    if [[ "$RF_COLAB_MODE" != "true" ]] || [[ "$RF_TRACKING_BACKEND" != "tensorboard" ]]; then
+        ((total_services++))
+    fi
+
+    # Frontend runs unless Colab mode
+    if [[ "$RF_COLAB_MODE" != "true" ]]; then
+        ((total_services++))
+    fi
+
+    print_status "Starting $total_services service(s)..."
+
+    # Start MLflow server (conditionally)
+    if [[ "$RF_COLAB_MODE" != "true" ]] || [[ "$RF_TRACKING_BACKEND" != "tensorboard" ]]; then
+        if start_mlflow; then
+            ((services_started++))
+        else
+            print_error "Failed to start MLflow server"
+        fi
     else
-…
+        print_status "⊗ Skipping MLflow (using TensorBoard-only tracking in Colab mode)"
     fi
 
-    # Start API server
+    # Start API server (always)
     if start_api_server; then
         ((services_started++))
     else
         print_error "Failed to start API server"
     fi
 
-    # Start frontend server
-    if
-…
+    # Start frontend server (conditionally)
+    if [[ "$RF_COLAB_MODE" != "true" ]]; then
+        if start_frontend; then
+            ((services_started++))
+        else
+            print_error "Failed to start frontend server"
+        fi
     else
-…
+        print_status "⊗ Skipping frontend (using TensorBoard in Colab mode)"
     fi
 
     return $((total_services - services_started))
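In Colab mode the frontend never starts, so show_status points users at TensorBoard instead. In a Colab or Jupyter cell that looks roughly like this (standard TensorBoard notebook magics; the log directory follows the pattern printed above, with the experiment name substituted):

    %load_ext tensorboard
    %tensorboard --logdir ~/experiments/{experiment_name}/tensorboard_logs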
rapidfireai/utils/constants.py
CHANGED
@@ -1,7 +1,28 @@
+import os
 from enum import Enum
 
 MLFLOW_URL = "http://127.0.0.1:5002"
 
+
+# Tracking Backend Configuration
+def get_tracking_backend() -> str:
+    """
+    Get the tracking backend from environment variable at runtime.
+
+    Returns:
+        str: The tracking backend ('mlflow', 'tensorboard', or 'both')
+
+    Note: This reads from os.environ at runtime to allow setting the env var
+    after module import (important for notebook environments like Colab).
+    """
+    backend = os.getenv("RF_TRACKING_BACKEND", "mlflow")
+    return backend
+
+
+# Backwards compatibility: Keep constant but it will be stale if env var changes after import
+TRACKING_BACKEND = get_tracking_backend()  # Options: 'mlflow', 'tensorboard', 'both'
+TENSORBOARD_LOG_DIR = os.getenv("RF_TENSORBOARD_LOG_DIR", None)  # Default set by experiment path
+
 # Shared Memory Constants
 SHM_WARN_THRESHOLD = 80
 SHM_MIN_FREE_SPACE = 1.0
@@ -24,7 +45,7 @@ class DispatcherConfig:
     """Class to manage the dispatcher configuration"""
 
     HOST: str = "127.0.0.1"
-    PORT: int =
+    PORT: int = 8081
 
 
 # Database Constants
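The docstring's caveat is the important part: TRACKING_BACKEND is evaluated once at import, while get_tracking_backend() re-reads the environment on every call. A small sketch of the difference (assuming the package is importable and RF_TRACKING_BACKEND was unset when the module was first imported):

    import os

    from rapidfireai.utils.constants import TRACKING_BACKEND, get_tracking_backend

    print(TRACKING_BACKEND)        # "mlflow" - captured once, at import time

    # Switch the backend later, e.g. from a notebook cell:
    os.environ["RF_TRACKING_BACKEND"] = "tensorboard"

    print(TRACKING_BACKEND)        # still "mlflow" - the constant is stale
    print(get_tracking_backend())  # "tensorboard" - read from os.environ at call time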
rapidfireai/utils/experiment_utils.py
CHANGED

@@ -8,7 +8,6 @@ import sys
 import warnings
 from typing import Any
 
-import mlflow
 import pandas as pd
 import torch
 from IPython.display import display
@@ -16,11 +15,13 @@ from tqdm import tqdm
 from transformers import logging as transformers_logging
 
 from rapidfireai.db.rf_db import RfDb
-from rapidfireai.utils.constants import MLFLOW_URL, ExperimentStatus, ExperimentTask
+from rapidfireai.utils.constants import MLFLOW_URL, ExperimentStatus, ExperimentTask, get_tracking_backend
 from rapidfireai.utils.datapaths import DataPath
 from rapidfireai.utils.exceptions import DBException, ExperimentException
 from rapidfireai.utils.logging import RFLogger
-from rapidfireai.utils.mlflow_manager import MLflowManager
+
+# Note: mlflow and MLflowManager are imported lazily inside conditional blocks
+# to avoid MLflow connection attempts when using tensorboard-only mode
 
 
 class ExperimentUtils:
@@ -82,12 +83,16 @@ class ExperimentUtils:
         self._disable_ml_warnings_display()
 
         # Clear any existing MLflow context before starting new experiment
-… (6 lines truncated in the diff view)
+        # Only if using MLflow backend
+        tracking_backend = get_tracking_backend()
+        if tracking_backend in ["mlflow", "both"]:
+            import mlflow  # Lazy import to avoid connection attempts in tensorboard-only mode
+            try:
+                if mlflow.active_run():
+                    print("Clearing existing MLflow context before starting new experiment")
+                    mlflow.end_run()
+            except Exception as e:
+                print(f"Error clearing existing MLflow context: {e}")
 
         # check if experiment is already running
         running_experiment = None
@@ -124,11 +129,18 @@ class ExperimentUtils:
                 given_name,
                 experiments_path,
             )
-… (5 lines truncated in the diff view)
+            if mlflow_experiment_id:
+                msg = (
+                    f"The previously running experiment {running_experiment['experiment_name']} was forcibly ended."
+                    f" Created a new experiment '{experiment_name}' with Experiment ID: {experiment_id}"
+                    f" and MLflow Experiment ID: {mlflow_experiment_id} at {experiments_path}/{experiment_name}"
+                )
+            else:
+                msg = (
+                    f"The previously running experiment {running_experiment['experiment_name']} was forcibly ended."
+                    f" Created a new experiment '{experiment_name}' with Experiment ID: {experiment_id}"
+                    f" at {experiments_path}/{experiment_name} (TensorBoard-only mode)"
+                )
             print(msg)
             log_messages.append(msg)
         # check if experiment name already exists
@@ -137,11 +149,18 @@ class ExperimentUtils:
                 given_name,
                 experiments_path,
             )
-… (5 lines truncated in the diff view)
+            if mlflow_experiment_id:
+                msg = (
+                    "An experiment with the same name already exists."
+                    f" Created a new experiment '{experiment_name}' with Experiment ID: {experiment_id}"
+                    f" and MLflow Experiment ID: {mlflow_experiment_id} at {experiments_path}/{experiment_name}"
+                )
+            else:
+                msg = (
+                    "An experiment with the same name already exists."
+                    f" Created a new experiment '{experiment_name}' with Experiment ID: {experiment_id}"
+                    f" at {experiments_path}/{experiment_name} (TensorBoard-only mode)"
+                )
             print(msg)
             log_messages.append(msg)
         else:
@@ -149,10 +168,16 @@ class ExperimentUtils:
                 given_name,
                 experiments_path,
             )
-… (4 lines truncated in the diff view)
+            if mlflow_experiment_id:
+                msg = (
+                    f"Experiment {experiment_name} created with Experiment ID: {experiment_id}"
+                    f" and MLflow Experiment ID: {mlflow_experiment_id} at {experiments_path}/{experiment_name}"
+                )
+            else:
+                msg = (
+                    f"Experiment {experiment_name} created with Experiment ID: {experiment_id}"
+                    f" at {experiments_path}/{experiment_name} (TensorBoard-only mode)"
+                )
             print(msg)
             log_messages.append(msg)
 
@@ -185,20 +210,24 @@ class ExperimentUtils:
         self.db.set_experiment_status(current_experiment["experiment_id"], ExperimentStatus.COMPLETED)
         self.db.reset_all_tables()
 
-        # Clear MLflow context
-… (5 lines truncated in the diff view)
-        # Also clear context through MLflowManager if available
+        # Clear MLflow context only if using MLflow backend
+        tracking_backend = get_tracking_backend()
+        if tracking_backend in ["mlflow", "both"]:
+            import mlflow  # Lazy import to avoid connection attempts in tensorboard-only mode
+            from rapidfireai.utils.mlflow_manager import MLflowManager
             try:
-… (6 lines truncated in the diff view)
+                if mlflow.active_run():
+                    print("Ending active MLflow run before ending experiment")
+                    mlflow.end_run()
+
+                # Also clear context through MLflowManager if available
+                try:
+                    mlflow_manager = MLflowManager(MLFLOW_URL)
+                    mlflow_manager.clear_context()
+                except Exception as e2:
+                    print(f"Error clearing MLflow context through MLflowManager: {e2}")
+            except Exception as e:
+                print(f"Error clearing MLflow context: {e}")
 
         # print experiment ended message
         msg = f"Experiment {experiment_name} ended"
@@ -311,28 +340,43 @@ class ExperimentUtils:
             print(f"Error displaying runs info: {e}")
             raise
 
-    def _create_experiment_internal(self, given_name: str, experiments_path: str) -> tuple[int, str, str]:
+    def _create_experiment_internal(self, given_name: str, experiments_path: str) -> tuple[int, str, str | None]:
         """Create new experiment -
         if given_name already exists - increment suffix and create new experiment
         if given_name is new - create new experiment with given name
+        Returns: experiment_id, experiment_name, mlflow_experiment_id (or None if tensorboard-only)
         """
         try:
             given_name = given_name if given_name else "rf-exp"
             experiment_name = self._generate_unique_experiment_name(given_name, self.db.get_all_experiment_names())
 
-…
-            mlflow_experiment_id =
-…
+            # Create MLflow experiment only if using MLflow backend
+            mlflow_experiment_id = None
+            tracking_backend = get_tracking_backend()
+            if tracking_backend in ["mlflow", "both"]:
+                import mlflow  # Lazy import to avoid connection attempts in tensorboard-only mode
+                from rapidfireai.utils.mlflow_manager import MLflowManager
+                try:
+                    mlflow_manager = MLflowManager(MLFLOW_URL)
+                    mlflow_experiment_id = mlflow_manager.create_experiment(experiment_name)
+                    mlflow.tracing.disable_notebook_display()
+                except Exception as e:
+                    # Catch MLflow-specific exceptions (mlflow.exceptions.RestException, etc.)
+                    raise ExperimentException(f"Error creating MLFlow experiment: {e}") from e
 
             # write new experiment details to database
             experiment_id = self.db.create_experiment(
                 experiment_name,
-                mlflow_experiment_id,
+                mlflow_experiment_id,  # Will be None for tensorboard-only
                 config_options={"experiments_path": experiments_path},
             )
             return experiment_id, experiment_name, mlflow_experiment_id
-        except
-            raise
+        except ExperimentException:
+            # Re-raise ExperimentExceptions (including MLflow errors from above)
+            raise
+        except Exception as e:
+            # Catch any other unexpected errors
+            raise ExperimentException(f"Error in _create_experiment_internal: {e}") from e
 
     def _generate_unique_experiment_name(self, name: str, existing_names: list[str]) -> str:
         """Increment the suffix of the name after the last '_' till it is unique"""
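The experiment_utils.py changes all follow one pattern: read the backend at call time, and only import or touch MLflow when the backend includes it, so TensorBoard-only runs never attempt an MLflow connection. A condensed, standalone sketch of that pattern (a paraphrase for illustration, not the package's actual helper):

    import os


    def end_active_mlflow_run() -> None:
        """End any active MLflow run, but only when MLflow tracking is enabled."""
        backend = os.getenv("RF_TRACKING_BACKEND", "mlflow")
        if backend not in ("mlflow", "both"):
            return  # tensorboard-only: mlflow is never imported, so no connection attempt

        import mlflow  # lazy import, as in the diff above

        try:
            if mlflow.active_run():
                mlflow.end_run()
        except Exception as exc:
            print(f"Error clearing MLflow context: {exc}")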