rapidfireai 0.10.2rc4__py3-none-any.whl → 0.10.3rc1__py3-none-any.whl
This diff shows the changes between publicly released versions of the package as they appear in their public registry; it is provided for informational purposes only.
- rapidfireai/backend/controller.py +29 -16
- rapidfireai/backend/worker.py +14 -7
- rapidfireai/cli.py +28 -1
- rapidfireai/db/rf_db.py +1 -1
- rapidfireai/db/tables.sql +1 -1
- rapidfireai/dispatcher/dispatcher.py +3 -1
- rapidfireai/dispatcher/gunicorn.conf.py +1 -1
- rapidfireai/experiment.py +75 -7
- rapidfireai/frontend/build/asset-manifest.json +3 -3
- rapidfireai/frontend/build/index.html +1 -1
- rapidfireai/frontend/build/static/js/{main.3ff1e37d.js → main.e7d3b759.js} +3 -3
- rapidfireai/frontend/build/static/js/{main.3ff1e37d.js.map → main.e7d3b759.js.map} +1 -1
- rapidfireai/frontend/proxy_middleware.py +1 -1
- rapidfireai/ml/callbacks.py +78 -38
- rapidfireai/ml/trainer.py +6 -6
- rapidfireai/start.sh +117 -34
- rapidfireai/utils/constants.py +20 -1
- rapidfireai/utils/experiment_utils.py +87 -43
- rapidfireai/utils/interactive_controller.py +494 -0
- rapidfireai/utils/metric_logger.py +346 -0
- rapidfireai/utils/mlflow_manager.py +0 -2
- rapidfireai/utils/worker_manager.py +16 -6
- rapidfireai/version.py +2 -2
- {rapidfireai-0.10.2rc4.dist-info → rapidfireai-0.10.3rc1.dist-info}/METADATA +7 -4
- {rapidfireai-0.10.2rc4.dist-info → rapidfireai-0.10.3rc1.dist-info}/RECORD +37 -34
- tutorial_notebooks/rf-colab-tensorboard-tutorial.ipynb +314 -0
- tutorial_notebooks/rf-tutorial-dpo-alignment-lite.ipynb +6 -6
- tutorial_notebooks/rf-tutorial-dpo-alignment.ipynb +6 -6
- tutorial_notebooks/rf-tutorial-grpo-mathreasoning-lite.ipynb +6 -6
- tutorial_notebooks/rf-tutorial-grpo-mathreasoning.ipynb +6 -6
- tutorial_notebooks/rf-tutorial-sft-chatqa-lite.ipynb +6 -6
- tutorial_notebooks/rf-tutorial-sft-chatqa.ipynb +6 -6
- /rapidfireai/frontend/build/static/js/{main.3ff1e37d.js.LICENSE.txt → main.e7d3b759.js.LICENSE.txt} +0 -0
- {rapidfireai-0.10.2rc4.dist-info → rapidfireai-0.10.3rc1.dist-info}/WHEEL +0 -0
- {rapidfireai-0.10.2rc4.dist-info → rapidfireai-0.10.3rc1.dist-info}/entry_points.txt +0 -0
- {rapidfireai-0.10.2rc4.dist-info → rapidfireai-0.10.3rc1.dist-info}/licenses/LICENSE +0 -0
- {rapidfireai-0.10.2rc4.dist-info → rapidfireai-0.10.3rc1.dist-info}/top_level.txt +0 -0
@@ -25,7 +25,7 @@ class UserProxyManager:
         self.default_proxy = {
             'main_proxy_target': 'http://127.0.0.1:5002/',
             'static_proxy_target': 'http://127.0.0.1:5002/',
-            'dispatcher_proxy_target': 'http://127.0.0.1:
+            'dispatcher_proxy_target': 'http://127.0.0.1:8081/',
         }

     def get_user_proxy(self, user_id: str) -> Dict[str, str]:
rapidfireai/ml/callbacks.py
CHANGED
@@ -20,7 +20,7 @@ class GenerationMetricsCallback(TrainerCallback):
         generation_config: Optional[Dict] = None,
         compute_metrics: Callable = None,
         batch_size: int = 8,
-
+        metric_logger=None,
         mlflow_run_id: str = None,
         completed_steps: int = 0,
     ):
@@ -36,7 +36,7 @@ class GenerationMetricsCallback(TrainerCallback):
             "pad_token_id": tokenizer.pad_token_id,
             "eos_token_id": tokenizer.eos_token_id,
         }
-        self.
+        self.metric_logger = metric_logger
         self.mlflow_run_id = mlflow_run_id
         self.completed_steps = completed_steps

@@ -63,8 +63,8 @@ class GenerationMetricsCallback(TrainerCallback):
             state.log_history.append(metrics)

         for key, value in metrics.items():
-            if self.
-                self.
+            if self.metric_logger:
+                self.metric_logger.log_metric(
                     self.mlflow_run_id,
                     key,
                     value,
@@ -72,41 +72,69 @@ class GenerationMetricsCallback(TrainerCallback):
                 )

     def _prepare_data(self, eval_dataset: Dataset) -> tuple:
-        """Prepare batch data for generation"""
+        """Prepare batch data for generation with defensive validation"""
         input_texts = []
         references = []

         for item in eval_dataset:
-            if isinstance(item, dict):
-            [14 removed lines truncated in this diff view]
+            if not isinstance(item, dict):
+                continue
+
+            input_text = None
+            reference = None
+
+            # Support multiple field name patterns
+            if "input" in item and "output" in item:
+                input_text = item["input"]
+                reference = item["output"]
+            elif "prompt" in item and "completion" in item:
+                input_text = item["prompt"]
+                reference = item["completion"][-1]["content"]
+                input_text = self.tokenizer.apply_chat_template(
+                    input_text, tokenize=False
+                )
+            elif "text" in item:
+                # SFT format - use text as input, response as reference
+                input_text = item["text"]
+                reference = item.get("response", item.get("instruction", item["text"]))
+            elif "instruction" in item and "response" in item:
+                # Direct instruction/response format
+                input_text = item["instruction"]
+                reference = item["response"]
+
+            # Validate non-empty strings
+            if input_text and isinstance(input_text, str) and input_text.strip():
+                if reference and isinstance(reference, str) and reference.strip():
+                    input_texts.append(input_text.strip())
+                    references.append(reference.strip())
+
+        # Return safe empty values to prevent downstream errors
+        if not input_texts:
+            return [], []

         return input_texts, references

-    def _generate_batch(self, model, input_texts: List[str]) ->
-        """Generate text for a batch of inputs"""
-        #
-        [2 removed lines truncated in this diff view]
-            return_tensors="pt",
-            padding=True,
-            truncation=True,
-            max_length=512, # Adjust based on your model's context length
-        ).to(model.device)
+    def _generate_batch(self, model, input_texts: List[str]) -> torch.Tensor:
+        """Generate text for a batch of inputs with defensive validation"""
+        # Defensive validation for empty inputs
+        if not input_texts:
+            return torch.empty((0, 0), dtype=torch.long).to(model.device)

-
+        try:
+            # Tokenize batch
+            inputs = self.tokenizer(
+                input_texts,
+                return_tensors="pt",
+                padding=True,
+                truncation=True,
+                max_length=512, # Adjust based on your model's context length
+            ).to(model.device)
+
+            return inputs["input_ids"]
+        except Exception as e:
+            # Log error and return empty tensor to prevent crash
+            print(f"Warning: Tokenization error in generation callback: {e}")
+            return torch.empty((0, 0), dtype=torch.long).to(model.device)

     def _compute_generation_metrics(self, model, step: int) -> Dict[str, float]:
         """Generate text and compute BLEU/ROUGE metrics with batch processing"""
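For context, the rewritten _prepare_data accepts several record layouts. The sketch below shows illustrative eval records matching the field patterns handled in the hunk above (input/output, chat-style prompt/completion, text, and instruction/response); only the field names come from the diff, the example values are hypothetical.

# Illustrative records only; the field names mirror the branches in _prepare_data above.
eval_items = [
    {"input": "Translate to French: Hello", "output": "Bonjour"},          # input/output pair
    {"prompt": [{"role": "user", "content": "What is 2 + 2?"}],
     "completion": [{"role": "assistant", "content": "4"}]},               # chat prompt/completion
    {"text": "Summarize the article...", "response": "A short summary."},  # SFT text format
    {"instruction": "List three primes.", "response": "2, 3, 5"},          # instruction/response
    "not a dict",                    # skipped: fails the isinstance(item, dict) check
    {"input": "", "output": "  "},   # dropped: fails the non-empty string validation
]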
@@ -121,7 +149,19 @@ class GenerationMetricsCallback(TrainerCallback):

         # Process in batches
         input_texts, batch_references = self._prepare_data(self.eval_dataset)
+
+        # Early return if no valid data
+        if not input_texts:
+            print("Warning: No valid eval data for generation metrics")
+            return {}
+
         input_ids = self._generate_batch(model, input_texts)
+
+        # Check for empty generation batch
+        if input_ids.numel() == 0:
+            print("Warning: Empty input_ids from tokenization")
+            return {}
+
         with torch.no_grad():
             for i in tqdm(
                 range(0, len(indices), self.batch_size), desc="Generating for metrics"
@@ -155,18 +195,18 @@ class GenerationMetricsCallback(TrainerCallback):


 class MLflowLoggingCallback(TrainerCallback):
-    """Callback for logging metrics to
+    """Callback for logging metrics to tracking backend during training"""

     def __init__(
         self,
-
+        metric_logger,
         mlflow_run_id: str,
         excluded_keys: list = None,
         completed_steps: int = 0,
         chunk_id: int = 0,
         num_epochs_completed: int = 0,
     ):
-        self.
+        self.metric_logger = metric_logger
         self.mlflow_run_id = mlflow_run_id
         self.completed_steps = completed_steps
         self.excluded_keys = excluded_keys or [
@@ -189,22 +229,22 @@ class MLflowLoggingCallback(TrainerCallback):
         for key, value in logs.items():
             if isinstance(value, (int, float)) and key not in self.excluded_keys:
                 try:
-                    self.
+                    self.metric_logger.log_metric(
                         self.mlflow_run_id,
                         key,
                         value,
                         step=self.completed_steps + state.global_step,
                     )
                 except Exception as e:
-                    print(f"Warning: Failed to log metric {key} to
+                    print(f"Warning: Failed to log metric {key} to tracking backend: {e}")
         if "eval_loss" not in logs and "train_runtime" not in logs:
-            self.
+            self.metric_logger.log_metric(
                 self.mlflow_run_id,
                 "chunk number",
                 self.chunk_id,
                 step=self.completed_steps + state.global_step,
             )
-            self.
+            self.metric_logger.log_metric(
                 self.mlflow_run_id,
                 "num_epochs_completed",
                 self.num_epochs_completed,
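Both callbacks above only assume a logger object exposing log_metric(run_id, key, value, step=...); when no logger is passed, GenerationMetricsCallback skips logging and _setup_callbacks (in trainer.py below) does not attach MLflowLoggingCallback at all. A minimal, hypothetical sketch of an object satisfying that call contract is shown here; it is not the actual MetricLogger added in rapidfireai/utils/metric_logger.py, whose implementation is not part of this excerpt.

# Hypothetical sketch of the duck-typed contract used by the callbacks above.
# The real rapidfireai/utils/metric_logger.py added in 0.10.3rc1 may differ.
class StdoutMetricLogger:
    """Minimal logger: any backend (MLflow, TensorBoard, or both) with this method works."""

    def log_metric(self, run_id: str, key: str, value: float, step: int = 0) -> None:
        # A real implementation would forward to mlflow.log_metric and/or a
        # TensorBoard SummaryWriter; printing is used here purely for illustration.
        print(f"[run {run_id}] step={step} {key}={value}")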
rapidfireai/ml/trainer.py
CHANGED
@@ -34,7 +34,7 @@ def create_trainer_instance(
     trainer_config: TrainerConfig,
     shm_manager: SharedMemoryManager,
     use_shared_memory: bool = False,
-
+    metric_logger=None,
     chunk_id: int = 0,
 ) -> tuple[SFTTrainer | DPOTrainer | GRPOTrainer | None, str]:
     """
@@ -98,7 +98,7 @@ def create_trainer_instance(

     callbacks, additional_trainer_kwargs = (
         _setup_callbacks(  # FIXME: avoid returning additional_trainer_kwargs
-
+            metric_logger,
             trainer_config,
             chunk_id,
             compute_metrics,
@@ -314,7 +314,7 @@ def _prepare_trainer_kwargs(


 def _setup_callbacks(
-
+    metric_logger,
     trainer_config,
     chunk_id,
     compute_metrics,
@@ -327,9 +327,9 @@ def _setup_callbacks(
     """Setup callbacks for the trainer."""
     callbacks = []

-    if
+    if metric_logger is not None and trainer_config.mlflow_run_id is not None:
         mlflow_callback = MLflowLoggingCallback(
-
+            metric_logger=metric_logger,
             mlflow_run_id=trainer_config.mlflow_run_id,
             completed_steps=trainer_config.completed_steps,
             chunk_id=chunk_id,
@@ -353,7 +353,7 @@ def _setup_callbacks(
             generation_config=additional_trainer_kwargs.get("generation_config"),
             compute_metrics=compute_metrics_function,
             batch_size=training_args.get("per_device_eval_batch_size"),
-
+            metric_logger=metric_logger,
             mlflow_run_id=trainer_config.mlflow_run_id,
             completed_steps=trainer_config.completed_steps,
         )
rapidfireai/start.sh
CHANGED
@@ -12,11 +12,15 @@ RF_MLFLOW_HOST=${RF_MLFLOW_HOST:=127.0.0.1}
 RF_FRONTEND_PORT=${RF_FRONTEND_PORT:=3000}
 RF_FRONTEND_HOST=${RF_FRONTEND_HOST:=0.0.0.0}
 # API server configuration - these should match DispatcherConfig in constants.py
-RF_API_PORT=${RF_API_PORT:=
+RF_API_PORT=${RF_API_PORT:=8081}
 RF_API_HOST=${RF_API_HOST:=127.0.0.1}

 RF_DB_PATH="${RF_DB_PATH:=$HOME/db}"

+# Colab mode configuration
+RF_COLAB_MODE=${RF_COLAB_MODE:=false}
+RF_TRACKING_BACKEND=${RF_TRACKING_BACKEND:=mlflow}
+
 # Colors for output
 RED='\033[0;31m'
 GREEN='\033[0;32m'
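Because RF_COLAB_MODE and RF_TRACKING_BACKEND are plain environment variables with defaults, a Colab notebook would typically export them before the start script runs. A hedged sketch of such a cell follows; only the variable names and accepted values come from this diff, and the CLI command mentioned in the comment is an assumption.

import os

# Assumed Colab-notebook usage; variable names and values match start.sh above.
os.environ["RF_COLAB_MODE"] = "true"               # skip the frontend in Colab
os.environ["RF_TRACKING_BACKEND"] = "tensorboard"  # 'mlflow', 'tensorboard', or 'both'

# The start script would then be launched with these variables in the environment,
# e.g. from a shell cell: !rapidfireai start   (exact CLI command is an assumption)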
@@ -124,20 +128,15 @@ cleanup() {
         rm -f "$RF_PID_FILE"
     fi

-    # Final cleanup -
-    [8 removed lines truncated in this diff view]
-        if [[ -n "$remaining_pids" ]]; then
-            print_status "Force killing remaining processes on port $port"
-            echo "$remaining_pids" | xargs kill -9 2>/dev/null || true
-        fi
-    done
+    # Final cleanup - ONLY if NOT in Colab mode
+    # Colab mode skips this to avoid killing Jupyter/IPython infrastructure
+    if [[ "$RF_COLAB_MODE" != "true" ]]; then
+        # Safe, specific patterns for non-Colab environments
+        pkill -f "mlflow server" 2>/dev/null || true
+        pkill -f "gunicorn.*rapidfireai.dispatcher" 2>/dev/null || true
+        # Only kill Flask server if we're not in Colab (frontend doesn't run in Colab)
+        pkill -f "python.*rapidfireai/frontend/server.py" 2>/dev/null || true
+    fi

     print_success "All services stopped"
     exit 0
@@ -280,6 +279,10 @@ start_mlflow() {
             grep -A 5 -B 2 "Error\|Exception\|Traceback\|Failed\|ImportError\|ModuleNotFoundError" "$SCRIPT_DIR/mlflow.log" | head -20
         fi
     else
+        if [[ "$RF_COLAB_MODE" == "true" ]] && [[ "$RF_TRACKING_BACKEND" == "tensorboard" ]]; then
+            print_status "⊗ Skipping MLflow (using TensorBoard-only tracking in Colab mode)"
+            return 0
+        fi
         print_error "No mlflow.log file found"
     fi

@@ -293,6 +296,19 @@ start_mlflow() {
     fi
 }

+# Function to conditionally start MLflow based on mode
+start_mlflow_if_needed() {
+    # In Colab mode with pure TensorBoard, skip MLflow
+    if [[ "$RF_COLAB_MODE" == "true" ]] && [[ "$RF_TRACKING_BACKEND" == "tensorboard" ]]; then
+        print_status "⊗ Skipping MLflow (using TensorBoard-only tracking in Colab mode)"
+        return 0
+    fi
+
+    # Otherwise start MLflow
+    start_mlflow
+    return $?
+}
+
 # Function to start API server
 start_api_server() {
     print_status "Starting API server with Gunicorn..."
@@ -481,6 +497,19 @@ start_frontend() {
     return 0
 }

+# Function to conditionally start frontend based on mode
+start_frontend_if_needed() {
+    # In Colab mode, always skip frontend
+    if [[ "$RF_COLAB_MODE" == "true" ]]; then
+        print_status "⊗ Skipping frontend (using TensorBoard in Colab mode)"
+        return 0
+    fi
+
+    # Otherwise start frontend
+    start_frontend
+    return $?
+}
+
 # Function to display running services
 show_status() {
     print_status "RapidFire AI Services Status:"
@@ -499,47 +528,101 @@ show_status() {
     fi

     echo ""
-    [3 removed lines truncated in this diff view]
+
+    # Display appropriate message based on mode
+    if [[ "$RF_COLAB_MODE" == "true" ]]; then
+        print_success "🚀 RapidFire running in Colab mode!"
+        print_status "📊 Use TensorBoard for metrics visualization:"
+        print_status "   In a Colab notebook cell, run:"
+        print_status "   %tensorboard --logdir ~/experiments/{experiment_name}/tensorboard_logs"
+        if [[ "$RF_TRACKING_BACKEND" == "mlflow" ]] || [[ "$RF_TRACKING_BACKEND" == "both" ]]; then
+            print_status ""
+            print_status "📈 MLflow UI available at: http://$RF_MLFLOW_HOST:$RF_MLFLOW_PORT"
+        fi
+    else
+        print_success "🚀 RapidFire Frontend is ready!"
+        print_status "👉 Open your browser and navigate to: http://$RF_FRONTEND_HOST:$RF_FRONTEND_PORT"
+        print_status "   (Click the link above or copy/paste the URL into your browser)"
+    fi

     # Show log file status
     echo ""
     print_status "Log files:"
-    [4 removed lines truncated in this diff view]
+
+    # Always check api.log
+    if [[ -f "$SCRIPT_DIR/api.log" ]]; then
+        local size=$(du -h "$SCRIPT_DIR/api.log" | cut -f1)
+        print_status "- api.log: $size"
+    else
+        print_warning "- api.log: not found"
+    fi
+
+    # Only check mlflow.log if MLflow is running
+    if [[ "$RF_COLAB_MODE" != "true" ]] || [[ "$RF_TRACKING_BACKEND" != "tensorboard" ]]; then
+        if [[ -f "$SCRIPT_DIR/mlflow.log" ]]; then
+            local size=$(du -h "$SCRIPT_DIR/mlflow.log" | cut -f1)
+            print_status "- mlflow.log: $size"
         else
-            print_warning "-
+            print_warning "- mlflow.log: not found"
         fi
-
+    fi
+
+    # Only check frontend.log if frontend is running
+    if [[ "$RF_COLAB_MODE" != "true" ]]; then
+        if [[ -f "$SCRIPT_DIR/frontend.log" ]]; then
+            local size=$(du -h "$SCRIPT_DIR/frontend.log" | cut -f1)
+            print_status "- frontend.log: $size"
+        else
+            print_warning "- frontend.log: not found"
+        fi
+    fi
 }

 # Function to start services based on mode
 start_services() {
     local services_started=0
-    local total_services=
+    local total_services=1 # API server always runs

-    #
-    [2 removed lines truncated in this diff view]
+    # Calculate total services based on mode
+    # MLflow runs unless tensorboard-only in Colab
+    if [[ "$RF_COLAB_MODE" != "true" ]] || [[ "$RF_TRACKING_BACKEND" != "tensorboard" ]]; then
+        ((total_services++))
+    fi
+
+    # Frontend runs unless Colab mode
+    if [[ "$RF_COLAB_MODE" != "true" ]]; then
+        ((total_services++))
+    fi
+
+    print_status "Starting $total_services service(s)..."
+
+    # Start MLflow server (conditionally)
+    if [[ "$RF_COLAB_MODE" != "true" ]] || [[ "$RF_TRACKING_BACKEND" != "tensorboard" ]]; then
+        if start_mlflow; then
+            ((services_started++))
+        else
+            print_error "Failed to start MLflow server"
+        fi
     else
-
+        print_status "⊗ Skipping MLflow (using TensorBoard-only tracking in Colab mode)"
     fi

-    # Start API server
+    # Start API server (always)
     if start_api_server; then
         ((services_started++))
     else
         print_error "Failed to start API server"
     fi

-    # Start frontend server
-    if
-
+    # Start frontend server (conditionally)
+    if [[ "$RF_COLAB_MODE" != "true" ]]; then
+        if start_frontend; then
+            ((services_started++))
+        else
+            print_error "Failed to start frontend server"
+        fi
     else
-
+        print_status "⊗ Skipping frontend (using TensorBoard in Colab mode)"
     fi

     return $((total_services - services_started))
rapidfireai/utils/constants.py
CHANGED
@@ -1,7 +1,26 @@
 from enum import Enum
+import os

 MLFLOW_URL = "http://127.0.0.1:5002"

+# Tracking Backend Configuration
+def get_tracking_backend() -> str:
+    """
+    Get the tracking backend from environment variable at runtime.
+
+    Returns:
+        str: The tracking backend ('mlflow', 'tensorboard', or 'both')
+
+    Note: This reads from os.environ at runtime to allow setting the env var
+    after module import (important for notebook environments like Colab).
+    """
+    backend = os.getenv("RF_TRACKING_BACKEND", "mlflow")
+    return backend
+
+# Backwards compatibility: Keep constant but it will be stale if env var changes after import
+TRACKING_BACKEND = get_tracking_backend()  # Options: 'mlflow', 'tensorboard', 'both'
+TENSORBOARD_LOG_DIR = os.getenv("RF_TENSORBOARD_LOG_DIR", None)  # Default set by experiment path
+
 # Shared Memory Constants
 SHM_WARN_THRESHOLD = 80
 SHM_MIN_FREE_SPACE = 1.0
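The note in the docstring is the reason both a function and a constant exist: TRACKING_BACKEND is frozen at import time, while get_tracking_backend() re-reads the environment on every call. A small illustration, assuming the module imports as rapidfireai.utils.constants:

import os
from rapidfireai.utils import constants

print(constants.TRACKING_BACKEND)            # value captured when the module was imported
os.environ["RF_TRACKING_BACKEND"] = "both"   # e.g. set later from a notebook cell
print(constants.get_tracking_backend())      # 'both': re-read from the environment at call time
print(constants.TRACKING_BACKEND)            # unchanged (stale), as the comment above warns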
@@ -24,7 +43,7 @@ class DispatcherConfig:
     """Class to manage the dispatcher configuration"""

     HOST: str = "127.0.0.1"
-    PORT: int =
+    PORT: int = 8081


 # Database Constants