rapidfireai 0.10.2rc5__py3-none-any.whl → 0.11.1rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of rapidfireai might be problematic (see the registry page for details).
- rapidfireai/automl/grid_search.py +4 -5
- rapidfireai/automl/model_config.py +41 -37
- rapidfireai/automl/random_search.py +21 -33
- rapidfireai/backend/controller.py +80 -161
- rapidfireai/backend/worker.py +26 -8
- rapidfireai/cli.py +171 -132
- rapidfireai/db/rf_db.py +1 -1
- rapidfireai/db/tables.sql +1 -1
- rapidfireai/dispatcher/dispatcher.py +3 -1
- rapidfireai/dispatcher/gunicorn.conf.py +1 -1
- rapidfireai/experiment.py +86 -7
- rapidfireai/frontend/build/asset-manifest.json +3 -3
- rapidfireai/frontend/build/index.html +1 -1
- rapidfireai/frontend/build/static/js/{main.1bf27639.js → main.58393d31.js} +3 -3
- rapidfireai/frontend/build/static/js/{main.1bf27639.js.map → main.58393d31.js.map} +1 -1
- rapidfireai/frontend/proxy_middleware.py +1 -1
- rapidfireai/ml/callbacks.py +85 -59
- rapidfireai/ml/trainer.py +42 -86
- rapidfireai/start.sh +117 -34
- rapidfireai/utils/constants.py +22 -1
- rapidfireai/utils/experiment_utils.py +87 -43
- rapidfireai/utils/interactive_controller.py +473 -0
- rapidfireai/utils/logging.py +1 -2
- rapidfireai/utils/metric_logger.py +346 -0
- rapidfireai/utils/mlflow_manager.py +0 -1
- rapidfireai/utils/ping.py +4 -2
- rapidfireai/utils/worker_manager.py +16 -6
- rapidfireai/version.py +2 -2
- {rapidfireai-0.10.2rc5.dist-info → rapidfireai-0.11.1rc1.dist-info}/METADATA +7 -4
- {rapidfireai-0.10.2rc5.dist-info → rapidfireai-0.11.1rc1.dist-info}/RECORD +36 -33
- tutorial_notebooks/rf-colab-tensorboard-tutorial.ipynb +314 -0
- /rapidfireai/frontend/build/static/js/{main.1bf27639.js.LICENSE.txt → main.58393d31.js.LICENSE.txt} +0 -0
- {rapidfireai-0.10.2rc5.dist-info → rapidfireai-0.11.1rc1.dist-info}/WHEEL +0 -0
- {rapidfireai-0.10.2rc5.dist-info → rapidfireai-0.11.1rc1.dist-info}/entry_points.txt +0 -0
- {rapidfireai-0.10.2rc5.dist-info → rapidfireai-0.11.1rc1.dist-info}/licenses/LICENSE +0 -0
- {rapidfireai-0.10.2rc5.dist-info → rapidfireai-0.11.1rc1.dist-info}/top_level.txt +0 -0
rapidfireai/start.sh
CHANGED
@@ -12,11 +12,15 @@ RF_MLFLOW_HOST=${RF_MLFLOW_HOST:=127.0.0.1}
 RF_FRONTEND_PORT=${RF_FRONTEND_PORT:=3000}
 RF_FRONTEND_HOST=${RF_FRONTEND_HOST:=0.0.0.0}
 # API server configuration - these should match DispatcherConfig in constants.py
-RF_API_PORT=${RF_API_PORT:=
+RF_API_PORT=${RF_API_PORT:=8081}
 RF_API_HOST=${RF_API_HOST:=127.0.0.1}
 
 RF_DB_PATH="${RF_DB_PATH:=$HOME/db}"
 
+# Colab mode configuration
+RF_COLAB_MODE=${RF_COLAB_MODE:=false}
+RF_TRACKING_BACKEND=${RF_TRACKING_BACKEND:=mlflow}
+
 # Colors for output
 RED='\033[0;31m'
 GREEN='\033[0;32m'
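The two new variables above make Colab-specific behavior opt-in. As a rough illustration (a hypothetical notebook cell; how start.sh is actually launched from Colab is not shown in this diff, and the services would need to inherit the environment), a user would export them before the services come up:

    import os

    # Hypothetical Colab setup: flag Colab mode and choose TensorBoard-only
    # tracking before the rapidfireai services start. The variable names and
    # accepted values come from the start.sh defaults above.
    os.environ["RF_COLAB_MODE"] = "true"
    os.environ["RF_TRACKING_BACKEND"] = "tensorboard"  # 'mlflow', 'tensorboard', or 'both'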
@@ -124,20 +128,15 @@ cleanup() {
         rm -f "$RF_PID_FILE"
     fi
 
-    # Final cleanup -
-… (8 lines truncated in the diff view)
-        if [[ -n "$remaining_pids" ]]; then
-            print_status "Force killing remaining processes on port $port"
-            echo "$remaining_pids" | xargs kill -9 2>/dev/null || true
-        fi
-    done
+    # Final cleanup - ONLY if NOT in Colab mode
+    # Colab mode skips this to avoid killing Jupyter/IPython infrastructure
+    if [[ "$RF_COLAB_MODE" != "true" ]]; then
+        # Safe, specific patterns for non-Colab environments
+        pkill -f "mlflow server" 2>/dev/null || true
+        pkill -f "gunicorn.*rapidfireai.dispatcher" 2>/dev/null || true
+        # Only kill Flask server if we're not in Colab (frontend doesn't run in Colab)
+        pkill -f "python.*rapidfireai/frontend/server.py" 2>/dev/null || true
+    fi
 
     print_success "All services stopped"
     exit 0
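The rewritten cleanup above replaces the old port-sweeping force-kill loop with targeted pkill -f patterns, guarded so that a Colab kernel's Jupyter/IPython processes are never touched. A quick dry run with pgrep (not part of start.sh) shows what each pattern would match before anything gets killed:

    # pgrep -a prints PID plus full command line; -f matches against the full
    # command line, exactly as pkill -f does. Nothing is killed here.
    pgrep -af "mlflow server"
    pgrep -af "gunicorn.*rapidfireai.dispatcher"
    pgrep -af "python.*rapidfireai/frontend/server.py"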
@@ -280,6 +279,10 @@ start_mlflow() {
            grep -A 5 -B 2 "Error\|Exception\|Traceback\|Failed\|ImportError\|ModuleNotFoundError" "$SCRIPT_DIR/mlflow.log" | head -20
         fi
     else
+        if [[ "$RF_COLAB_MODE" == "true" ]] && [[ "$RF_TRACKING_BACKEND" == "tensorboard" ]]; then
+            print_status "⊗ Skipping MLflow (using TensorBoard-only tracking in Colab mode)"
+            return 0
+        fi
         print_error "No mlflow.log file found"
     fi
 
@@ -293,6 +296,19 @@ start_mlflow() {
     fi
 }
 
+# Function to conditionally start MLflow based on mode
+start_mlflow_if_needed() {
+    # In Colab mode with pure TensorBoard, skip MLflow
+    if [[ "$RF_COLAB_MODE" == "true" ]] && [[ "$RF_TRACKING_BACKEND" == "tensorboard" ]]; then
+        print_status "⊗ Skipping MLflow (using TensorBoard-only tracking in Colab mode)"
+        return 0
+    fi
+
+    # Otherwise start MLflow
+    start_mlflow
+    return $?
+}
+
 # Function to start API server
 start_api_server() {
     print_status "Starting API server with Gunicorn..."
@@ -481,6 +497,19 @@ start_frontend() {
     return 0
 }
 
+# Function to conditionally start frontend based on mode
+start_frontend_if_needed() {
+    # In Colab mode, always skip frontend
+    if [[ "$RF_COLAB_MODE" == "true" ]]; then
+        print_status "⊗ Skipping frontend (using TensorBoard in Colab mode)"
+        return 0
+    fi
+
+    # Otherwise start frontend
+    start_frontend
+    return $?
+}
+
 # Function to display running services
 show_status() {
     print_status "RapidFire AI Services Status:"
@@ -499,47 +528,101 @@ show_status() {
     fi
 
     echo ""
-… (3 lines truncated in the diff view)
+
+    # Display appropriate message based on mode
+    if [[ "$RF_COLAB_MODE" == "true" ]]; then
+        print_success "🚀 RapidFire running in Colab mode!"
+        print_status "📊 Use TensorBoard for metrics visualization:"
+        print_status "   In a Colab notebook cell, run:"
+        print_status "   %tensorboard --logdir ~/experiments/{experiment_name}/tensorboard_logs"
+        if [[ "$RF_TRACKING_BACKEND" == "mlflow" ]] || [[ "$RF_TRACKING_BACKEND" == "both" ]]; then
+            print_status ""
+            print_status "📈 MLflow UI available at: http://$RF_MLFLOW_HOST:$RF_MLFLOW_PORT"
+        fi
+    else
+        print_success "🚀 RapidFire Frontend is ready!"
+        print_status "👉 Open your browser and navigate to: http://$RF_FRONTEND_HOST:$RF_FRONTEND_PORT"
+        print_status "   (Click the link above or copy/paste the URL into your browser)"
+    fi
 
     # Show log file status
     echo ""
     print_status "Log files:"
-… (4 lines truncated in the diff view)
+
+    # Always check api.log
+    if [[ -f "$SCRIPT_DIR/api.log" ]]; then
+        local size=$(du -h "$SCRIPT_DIR/api.log" | cut -f1)
+        print_status "- api.log: $size"
+    else
+        print_warning "- api.log: not found"
+    fi
+
+    # Only check mlflow.log if MLflow is running
+    if [[ "$RF_COLAB_MODE" != "true" ]] || [[ "$RF_TRACKING_BACKEND" != "tensorboard" ]]; then
+        if [[ -f "$SCRIPT_DIR/mlflow.log" ]]; then
+            local size=$(du -h "$SCRIPT_DIR/mlflow.log" | cut -f1)
+            print_status "- mlflow.log: $size"
         else
-            print_warning "-
+            print_warning "- mlflow.log: not found"
         fi
-…
+    fi
+
+    # Only check frontend.log if frontend is running
+    if [[ "$RF_COLAB_MODE" != "true" ]]; then
+        if [[ -f "$SCRIPT_DIR/frontend.log" ]]; then
+            local size=$(du -h "$SCRIPT_DIR/frontend.log" | cut -f1)
+            print_status "- frontend.log: $size"
+        else
+            print_warning "- frontend.log: not found"
+        fi
+    fi
 }
 
 # Function to start services based on mode
 start_services() {
     local services_started=0
-    local total_services=
+    local total_services=1 # API server always runs
 
-    #
-… (2 lines truncated in the diff view)
+    # Calculate total services based on mode
+    # MLflow runs unless tensorboard-only in Colab
+    if [[ "$RF_COLAB_MODE" != "true" ]] || [[ "$RF_TRACKING_BACKEND" != "tensorboard" ]]; then
+        ((total_services++))
+    fi
+
+    # Frontend runs unless Colab mode
+    if [[ "$RF_COLAB_MODE" != "true" ]]; then
+        ((total_services++))
+    fi
+
+    print_status "Starting $total_services service(s)..."
+
+    # Start MLflow server (conditionally)
+    if [[ "$RF_COLAB_MODE" != "true" ]] || [[ "$RF_TRACKING_BACKEND" != "tensorboard" ]]; then
+        if start_mlflow; then
+            ((services_started++))
+        else
+            print_error "Failed to start MLflow server"
+        fi
     else
-…
+        print_status "⊗ Skipping MLflow (using TensorBoard-only tracking in Colab mode)"
     fi
 
-    # Start API server
+    # Start API server (always)
     if start_api_server; then
         ((services_started++))
     else
         print_error "Failed to start API server"
     fi
 
-    # Start frontend server
-    if
-…
+    # Start frontend server (conditionally)
+    if [[ "$RF_COLAB_MODE" != "true" ]]; then
+        if start_frontend; then
+            ((services_started++))
+        else
+            print_error "Failed to start frontend server"
+        fi
     else
-…
+        print_status "⊗ Skipping frontend (using TensorBoard in Colab mode)"
     fi
 
     return $((total_services - services_started))
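In Colab mode the frontend never starts, so show_status points users at TensorBoard instead. In a Colab or Jupyter cell that looks roughly like this (standard TensorBoard notebook magics; the log directory follows the pattern printed above, with the experiment name substituted):

    %load_ext tensorboard
    %tensorboard --logdir ~/experiments/{experiment_name}/tensorboard_logs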
rapidfireai/utils/constants.py
CHANGED
@@ -1,7 +1,28 @@
+import os
 from enum import Enum
 
 MLFLOW_URL = "http://127.0.0.1:5002"
 
+
+# Tracking Backend Configuration
+def get_tracking_backend() -> str:
+    """
+    Get the tracking backend from environment variable at runtime.
+
+    Returns:
+        str: The tracking backend ('mlflow', 'tensorboard', or 'both')
+
+    Note: This reads from os.environ at runtime to allow setting the env var
+    after module import (important for notebook environments like Colab).
+    """
+    backend = os.getenv("RF_TRACKING_BACKEND", "mlflow")
+    return backend
+
+
+# Backwards compatibility: Keep constant but it will be stale if env var changes after import
+TRACKING_BACKEND = get_tracking_backend()  # Options: 'mlflow', 'tensorboard', 'both'
+TENSORBOARD_LOG_DIR = os.getenv("RF_TENSORBOARD_LOG_DIR", None)  # Default set by experiment path
+
 # Shared Memory Constants
 SHM_WARN_THRESHOLD = 80
 SHM_MIN_FREE_SPACE = 1.0
@@ -24,7 +45,7 @@ class DispatcherConfig:
     """Class to manage the dispatcher configuration"""
 
     HOST: str = "127.0.0.1"
-    PORT: int =
+    PORT: int = 8081
 
 
 # Database Constants
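The docstring's caveat is the important part: TRACKING_BACKEND is evaluated once at import, while get_tracking_backend() re-reads the environment on every call. A small sketch of the difference (assuming the package is importable and RF_TRACKING_BACKEND was unset when the module was first imported):

    import os

    from rapidfireai.utils.constants import TRACKING_BACKEND, get_tracking_backend

    print(TRACKING_BACKEND)        # "mlflow" - captured once, at import time

    # Switch the backend later, e.g. from a notebook cell:
    os.environ["RF_TRACKING_BACKEND"] = "tensorboard"

    print(TRACKING_BACKEND)        # still "mlflow" - the constant is stale
    print(get_tracking_backend())  # "tensorboard" - read from os.environ at call time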
rapidfireai/utils/experiment_utils.py
CHANGED

@@ -8,7 +8,6 @@ import sys
 import warnings
 from typing import Any
 
-import mlflow
 import pandas as pd
 import torch
 from IPython.display import display
@@ -16,11 +15,13 @@ from tqdm import tqdm
 from transformers import logging as transformers_logging
 
 from rapidfireai.db.rf_db import RfDb
-from rapidfireai.utils.constants import MLFLOW_URL, ExperimentStatus, ExperimentTask
+from rapidfireai.utils.constants import MLFLOW_URL, ExperimentStatus, ExperimentTask, get_tracking_backend
 from rapidfireai.utils.datapaths import DataPath
 from rapidfireai.utils.exceptions import DBException, ExperimentException
 from rapidfireai.utils.logging import RFLogger
-from rapidfireai.utils.mlflow_manager import MLflowManager
+
+# Note: mlflow and MLflowManager are imported lazily inside conditional blocks
+# to avoid MLflow connection attempts when using tensorboard-only mode
 
 
 class ExperimentUtils:
@@ -82,12 +83,16 @@ class ExperimentUtils:
         self._disable_ml_warnings_display()
 
         # Clear any existing MLflow context before starting new experiment
-… (6 lines truncated in the diff view)
+        # Only if using MLflow backend
+        tracking_backend = get_tracking_backend()
+        if tracking_backend in ["mlflow", "both"]:
+            import mlflow  # Lazy import to avoid connection attempts in tensorboard-only mode
+            try:
+                if mlflow.active_run():
+                    print("Clearing existing MLflow context before starting new experiment")
+                    mlflow.end_run()
+            except Exception as e:
+                print(f"Error clearing existing MLflow context: {e}")
 
         # check if experiment is already running
         running_experiment = None
@@ -124,11 +129,18 @@ class ExperimentUtils:
                 given_name,
                 experiments_path,
             )
-… (5 lines truncated in the diff view)
+            if mlflow_experiment_id:
+                msg = (
+                    f"The previously running experiment {running_experiment['experiment_name']} was forcibly ended."
+                    f" Created a new experiment '{experiment_name}' with Experiment ID: {experiment_id}"
+                    f" and MLflow Experiment ID: {mlflow_experiment_id} at {experiments_path}/{experiment_name}"
+                )
+            else:
+                msg = (
+                    f"The previously running experiment {running_experiment['experiment_name']} was forcibly ended."
+                    f" Created a new experiment '{experiment_name}' with Experiment ID: {experiment_id}"
+                    f" at {experiments_path}/{experiment_name} (TensorBoard-only mode)"
+                )
             print(msg)
             log_messages.append(msg)
         # check if experiment name already exists
@@ -137,11 +149,18 @@ class ExperimentUtils:
                 given_name,
                 experiments_path,
             )
-… (5 lines truncated in the diff view)
+            if mlflow_experiment_id:
+                msg = (
+                    "An experiment with the same name already exists."
+                    f" Created a new experiment '{experiment_name}' with Experiment ID: {experiment_id}"
+                    f" and MLflow Experiment ID: {mlflow_experiment_id} at {experiments_path}/{experiment_name}"
+                )
+            else:
+                msg = (
+                    "An experiment with the same name already exists."
+                    f" Created a new experiment '{experiment_name}' with Experiment ID: {experiment_id}"
+                    f" at {experiments_path}/{experiment_name} (TensorBoard-only mode)"
+                )
             print(msg)
             log_messages.append(msg)
         else:
@@ -149,10 +168,16 @@ class ExperimentUtils:
                 given_name,
                 experiments_path,
             )
-… (4 lines truncated in the diff view)
+            if mlflow_experiment_id:
+                msg = (
+                    f"Experiment {experiment_name} created with Experiment ID: {experiment_id}"
+                    f" and MLflow Experiment ID: {mlflow_experiment_id} at {experiments_path}/{experiment_name}"
+                )
+            else:
+                msg = (
+                    f"Experiment {experiment_name} created with Experiment ID: {experiment_id}"
+                    f" at {experiments_path}/{experiment_name} (TensorBoard-only mode)"
+                )
             print(msg)
             log_messages.append(msg)
 
@@ -185,20 +210,24 @@ class ExperimentUtils:
         self.db.set_experiment_status(current_experiment["experiment_id"], ExperimentStatus.COMPLETED)
         self.db.reset_all_tables()
 
-        # Clear MLflow context
-… (5 lines truncated in the diff view)
-        # Also clear context through MLflowManager if available
+        # Clear MLflow context only if using MLflow backend
+        tracking_backend = get_tracking_backend()
+        if tracking_backend in ["mlflow", "both"]:
+            import mlflow  # Lazy import to avoid connection attempts in tensorboard-only mode
+            from rapidfireai.utils.mlflow_manager import MLflowManager
             try:
-… (6 lines truncated in the diff view)
+                if mlflow.active_run():
+                    print("Ending active MLflow run before ending experiment")
+                    mlflow.end_run()
+
+                # Also clear context through MLflowManager if available
+                try:
+                    mlflow_manager = MLflowManager(MLFLOW_URL)
+                    mlflow_manager.clear_context()
+                except Exception as e2:
+                    print(f"Error clearing MLflow context through MLflowManager: {e2}")
+            except Exception as e:
+                print(f"Error clearing MLflow context: {e}")
 
         # print experiment ended message
         msg = f"Experiment {experiment_name} ended"
@@ -311,28 +340,43 @@ class ExperimentUtils:
             print(f"Error displaying runs info: {e}")
             raise
 
-    def _create_experiment_internal(self, given_name: str, experiments_path: str) -> tuple[int, str, str]:
+    def _create_experiment_internal(self, given_name: str, experiments_path: str) -> tuple[int, str, str | None]:
         """Create new experiment -
         if given_name already exists - increment suffix and create new experiment
         if given_name is new - create new experiment with given name
+        Returns: experiment_id, experiment_name, mlflow_experiment_id (or None if tensorboard-only)
         """
         try:
             given_name = given_name if given_name else "rf-exp"
             experiment_name = self._generate_unique_experiment_name(given_name, self.db.get_all_experiment_names())
 
-…
-            mlflow_experiment_id =
-…
+            # Create MLflow experiment only if using MLflow backend
+            mlflow_experiment_id = None
+            tracking_backend = get_tracking_backend()
+            if tracking_backend in ["mlflow", "both"]:
+                import mlflow  # Lazy import to avoid connection attempts in tensorboard-only mode
+                from rapidfireai.utils.mlflow_manager import MLflowManager
+                try:
+                    mlflow_manager = MLflowManager(MLFLOW_URL)
+                    mlflow_experiment_id = mlflow_manager.create_experiment(experiment_name)
+                    mlflow.tracing.disable_notebook_display()
+                except Exception as e:
+                    # Catch MLflow-specific exceptions (mlflow.exceptions.RestException, etc.)
+                    raise ExperimentException(f"Error creating MLFlow experiment: {e}") from e
 
             # write new experiment details to database
             experiment_id = self.db.create_experiment(
                 experiment_name,
-                mlflow_experiment_id,
+                mlflow_experiment_id,  # Will be None for tensorboard-only
                 config_options={"experiments_path": experiments_path},
             )
             return experiment_id, experiment_name, mlflow_experiment_id
-        except
-            raise
+        except ExperimentException:
+            # Re-raise ExperimentExceptions (including MLflow errors from above)
+            raise
+        except Exception as e:
+            # Catch any other unexpected errors
+            raise ExperimentException(f"Error in _create_experiment_internal: {e}") from e
 
     def _generate_unique_experiment_name(self, name: str, existing_names: list[str]) -> str:
         """Increment the suffix of the name after the last '_' till it is unique"""
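The experiment_utils.py changes all follow one pattern: read the backend at call time, and only import or touch MLflow when the backend includes it, so TensorBoard-only runs never attempt an MLflow connection. A condensed, standalone sketch of that pattern (a paraphrase for illustration, not the package's actual helper):

    import os


    def end_active_mlflow_run() -> None:
        """End any active MLflow run, but only when MLflow tracking is enabled."""
        backend = os.getenv("RF_TRACKING_BACKEND", "mlflow")
        if backend not in ("mlflow", "both"):
            return  # tensorboard-only: mlflow is never imported, so no connection attempt

        import mlflow  # lazy import, as in the diff above

        try:
            if mlflow.active_run():
                mlflow.end_run()
        except Exception as exc:
            print(f"Error clearing MLflow context: {exc}")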