rapidfireai 0.10.2rc5__py3-none-any.whl → 0.11.1rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of rapidfireai might be problematic.

Files changed (36)
  1. rapidfireai/automl/grid_search.py +4 -5
  2. rapidfireai/automl/model_config.py +41 -37
  3. rapidfireai/automl/random_search.py +21 -33
  4. rapidfireai/backend/controller.py +80 -161
  5. rapidfireai/backend/worker.py +26 -8
  6. rapidfireai/cli.py +171 -132
  7. rapidfireai/db/rf_db.py +1 -1
  8. rapidfireai/db/tables.sql +1 -1
  9. rapidfireai/dispatcher/dispatcher.py +3 -1
  10. rapidfireai/dispatcher/gunicorn.conf.py +1 -1
  11. rapidfireai/experiment.py +86 -7
  12. rapidfireai/frontend/build/asset-manifest.json +3 -3
  13. rapidfireai/frontend/build/index.html +1 -1
  14. rapidfireai/frontend/build/static/js/{main.1bf27639.js → main.58393d31.js} +3 -3
  15. rapidfireai/frontend/build/static/js/{main.1bf27639.js.map → main.58393d31.js.map} +1 -1
  16. rapidfireai/frontend/proxy_middleware.py +1 -1
  17. rapidfireai/ml/callbacks.py +85 -59
  18. rapidfireai/ml/trainer.py +42 -86
  19. rapidfireai/start.sh +117 -34
  20. rapidfireai/utils/constants.py +22 -1
  21. rapidfireai/utils/experiment_utils.py +87 -43
  22. rapidfireai/utils/interactive_controller.py +473 -0
  23. rapidfireai/utils/logging.py +1 -2
  24. rapidfireai/utils/metric_logger.py +346 -0
  25. rapidfireai/utils/mlflow_manager.py +0 -1
  26. rapidfireai/utils/ping.py +4 -2
  27. rapidfireai/utils/worker_manager.py +16 -6
  28. rapidfireai/version.py +2 -2
  29. {rapidfireai-0.10.2rc5.dist-info → rapidfireai-0.11.1rc1.dist-info}/METADATA +7 -4
  30. {rapidfireai-0.10.2rc5.dist-info → rapidfireai-0.11.1rc1.dist-info}/RECORD +36 -33
  31. tutorial_notebooks/rf-colab-tensorboard-tutorial.ipynb +314 -0
  32. /rapidfireai/frontend/build/static/js/{main.1bf27639.js.LICENSE.txt → main.58393d31.js.LICENSE.txt} +0 -0
  33. {rapidfireai-0.10.2rc5.dist-info → rapidfireai-0.11.1rc1.dist-info}/WHEEL +0 -0
  34. {rapidfireai-0.10.2rc5.dist-info → rapidfireai-0.11.1rc1.dist-info}/entry_points.txt +0 -0
  35. {rapidfireai-0.10.2rc5.dist-info → rapidfireai-0.11.1rc1.dist-info}/licenses/LICENSE +0 -0
  36. {rapidfireai-0.10.2rc5.dist-info → rapidfireai-0.11.1rc1.dist-info}/top_level.txt +0 -0
rapidfireai/start.sh CHANGED
@@ -12,11 +12,15 @@ RF_MLFLOW_HOST=${RF_MLFLOW_HOST:=127.0.0.1}
 RF_FRONTEND_PORT=${RF_FRONTEND_PORT:=3000}
 RF_FRONTEND_HOST=${RF_FRONTEND_HOST:=0.0.0.0}
 # API server configuration - these should match DispatcherConfig in constants.py
-RF_API_PORT=${RF_API_PORT:=8080}
+RF_API_PORT=${RF_API_PORT:=8081}
 RF_API_HOST=${RF_API_HOST:=127.0.0.1}
 
 RF_DB_PATH="${RF_DB_PATH:=$HOME/db}"
 
+# Colab mode configuration
+RF_COLAB_MODE=${RF_COLAB_MODE:=false}
+RF_TRACKING_BACKEND=${RF_TRACKING_BACKEND:=mlflow}
+
 # Colors for output
 RED='\033[0;31m'
 GREEN='\033[0;32m'
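
Note: the two new variables above drive all of the Colab-specific gating later in this script. In a Colab notebook they would typically be set from Python before the launcher runs; a minimal sketch (the env var names come from this diff, while the launch step itself is only an illustrative assumption):

    import os

    # Enable Colab mode and pick the tracking backend before starting services.
    # Valid values per this diff: "mlflow", "tensorboard", or "both".
    os.environ["RF_COLAB_MODE"] = "true"
    os.environ["RF_TRACKING_BACKEND"] = "tensorboard"

    # Then launch however start.sh is invoked in your setup (e.g. a shell cell);
    # the exact command is not part of this diff.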
@@ -124,20 +128,15 @@ cleanup() {
         rm -f "$RF_PID_FILE"
     fi
 
-    # Final cleanup - kill any remaining MLflow, gunicorn, or Flask processes
-    pkill -f "mlflow server" 2>/dev/null || true
-    pkill -f "gunicorn.*rapidfireai" 2>/dev/null || true
-    pkill -f "python3.*server.py" 2>/dev/null || true
-    pkill -f "python.*server.py" 2>/dev/null || true
-
-    # Additional cleanup for any remaining processes on our ports
-    for port in $RF_MLFLOW_PORT $RF_FRONTEND_PORT $RF_API_PORT; do
-        local remaining_pids=$(lsof -ti :$port 2>/dev/null || true)
-        if [[ -n "$remaining_pids" ]]; then
-            print_status "Force killing remaining processes on port $port"
-            echo "$remaining_pids" | xargs kill -9 2>/dev/null || true
-        fi
-    done
+    # Final cleanup - ONLY if NOT in Colab mode
+    # Colab mode skips this to avoid killing Jupyter/IPython infrastructure
+    if [[ "$RF_COLAB_MODE" != "true" ]]; then
+        # Safe, specific patterns for non-Colab environments
+        pkill -f "mlflow server" 2>/dev/null || true
+        pkill -f "gunicorn.*rapidfireai.dispatcher" 2>/dev/null || true
+        # Only kill Flask server if we're not in Colab (frontend doesn't run in Colab)
+        pkill -f "python.*rapidfireai/frontend/server.py" 2>/dev/null || true
+    fi
 
     print_success "All services stopped"
     exit 0
@@ -280,6 +279,10 @@ start_mlflow() {
             grep -A 5 -B 2 "Error\|Exception\|Traceback\|Failed\|ImportError\|ModuleNotFoundError" "$SCRIPT_DIR/mlflow.log" | head -20
         fi
     else
+        if [[ "$RF_COLAB_MODE" == "true" ]] && [[ "$RF_TRACKING_BACKEND" == "tensorboard" ]]; then
+            print_status "⊗ Skipping MLflow (using TensorBoard-only tracking in Colab mode)"
+            return 0
+        fi
         print_error "No mlflow.log file found"
     fi
 
@@ -293,6 +296,19 @@ start_mlflow() {
     fi
 }
 
+# Function to conditionally start MLflow based on mode
+start_mlflow_if_needed() {
+    # In Colab mode with pure TensorBoard, skip MLflow
+    if [[ "$RF_COLAB_MODE" == "true" ]] && [[ "$RF_TRACKING_BACKEND" == "tensorboard" ]]; then
+        print_status "⊗ Skipping MLflow (using TensorBoard-only tracking in Colab mode)"
+        return 0
+    fi
+
+    # Otherwise start MLflow
+    start_mlflow
+    return $?
+}
+
 # Function to start API server
 start_api_server() {
     print_status "Starting API server with Gunicorn..."
@@ -481,6 +497,19 @@ start_frontend() {
     return 0
 }
 
+# Function to conditionally start frontend based on mode
+start_frontend_if_needed() {
+    # In Colab mode, always skip frontend
+    if [[ "$RF_COLAB_MODE" == "true" ]]; then
+        print_status "⊗ Skipping frontend (using TensorBoard in Colab mode)"
+        return 0
+    fi
+
+    # Otherwise start frontend
+    start_frontend
+    return $?
+}
+
 # Function to display running services
 show_status() {
     print_status "RapidFire AI Services Status:"
@@ -499,47 +528,101 @@ show_status() {
     fi
 
     echo ""
-    print_success "🚀 RapidFire Frontend is ready!"
-    print_status "👉 Open your browser and navigate to: http://$RF_FRONTEND_HOST:$RF_FRONTEND_PORT"
-    print_status "   (Click the link above or copy/paste the URL into your browser)"
+
+    # Display appropriate message based on mode
+    if [[ "$RF_COLAB_MODE" == "true" ]]; then
+        print_success "🚀 RapidFire running in Colab mode!"
+        print_status "📊 Use TensorBoard for metrics visualization:"
+        print_status "   In a Colab notebook cell, run:"
+        print_status "   %tensorboard --logdir ~/experiments/{experiment_name}/tensorboard_logs"
+        if [[ "$RF_TRACKING_BACKEND" == "mlflow" ]] || [[ "$RF_TRACKING_BACKEND" == "both" ]]; then
+            print_status ""
+            print_status "📈 MLflow UI available at: http://$RF_MLFLOW_HOST:$RF_MLFLOW_PORT"
+        fi
+    else
+        print_success "🚀 RapidFire Frontend is ready!"
+        print_status "👉 Open your browser and navigate to: http://$RF_FRONTEND_HOST:$RF_FRONTEND_PORT"
+        print_status "   (Click the link above or copy/paste the URL into your browser)"
+    fi
 
     # Show log file status
     echo ""
     print_status "Log files:"
-    for log_file in "mlflow.log" "api.log" "frontend.log"; do
-        if [[ -f "$SCRIPT_DIR/$log_file" ]]; then
-            local size=$(du -h "$SCRIPT_DIR/$log_file" | cut -f1)
-            print_status "- $log_file: $size"
+
+    # Always check api.log
+    if [[ -f "$SCRIPT_DIR/api.log" ]]; then
+        local size=$(du -h "$SCRIPT_DIR/api.log" | cut -f1)
+        print_status "- api.log: $size"
+    else
+        print_warning "- api.log: not found"
+    fi
+
+    # Only check mlflow.log if MLflow is running
+    if [[ "$RF_COLAB_MODE" != "true" ]] || [[ "$RF_TRACKING_BACKEND" != "tensorboard" ]]; then
+        if [[ -f "$SCRIPT_DIR/mlflow.log" ]]; then
+            local size=$(du -h "$SCRIPT_DIR/mlflow.log" | cut -f1)
+            print_status "- mlflow.log: $size"
         else
-            print_warning "- $log_file: not found"
+            print_warning "- mlflow.log: not found"
         fi
-    done
+    fi
+
+    # Only check frontend.log if frontend is running
+    if [[ "$RF_COLAB_MODE" != "true" ]]; then
+        if [[ -f "$SCRIPT_DIR/frontend.log" ]]; then
+            local size=$(du -h "$SCRIPT_DIR/frontend.log" | cut -f1)
+            print_status "- frontend.log: $size"
+        else
+            print_warning "- frontend.log: not found"
+        fi
+    fi
 }
 
 # Function to start services based on mode
 start_services() {
     local services_started=0
-    local total_services=3
+    local total_services=1  # API server always runs
 
-    # Start MLflow server
-    if start_mlflow; then
-        ((services_started++))
+    # Calculate total services based on mode
+    # MLflow runs unless tensorboard-only in Colab
+    if [[ "$RF_COLAB_MODE" != "true" ]] || [[ "$RF_TRACKING_BACKEND" != "tensorboard" ]]; then
+        ((total_services++))
+    fi
+
+    # Frontend runs unless Colab mode
+    if [[ "$RF_COLAB_MODE" != "true" ]]; then
+        ((total_services++))
+    fi
+
+    print_status "Starting $total_services service(s)..."
+
+    # Start MLflow server (conditionally)
+    if [[ "$RF_COLAB_MODE" != "true" ]] || [[ "$RF_TRACKING_BACKEND" != "tensorboard" ]]; then
+        if start_mlflow; then
+            ((services_started++))
+        else
+            print_error "Failed to start MLflow server"
+        fi
     else
-        print_error "Failed to start MLflow server"
+        print_status "  Skipping MLflow (using TensorBoard-only tracking in Colab mode)"
    fi
 
-    # Start API server
+    # Start API server (always)
     if start_api_server; then
         ((services_started++))
     else
         print_error "Failed to start API server"
     fi
 
-    # Start frontend server
-    if start_frontend; then
-        ((services_started++))
+    # Start frontend server (conditionally)
+    if [[ "$RF_COLAB_MODE" != "true" ]]; then
+        if start_frontend; then
+            ((services_started++))
+        else
+            print_error "Failed to start frontend server"
+        fi
     else
-        print_error "Failed to start frontend server"
+        print_status "  Skipping frontend (using TensorBoard in Colab mode)"
     fi
 
     return $((total_services - services_started))
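
Note: the gating above yields a different service count per mode. A minimal Python sketch of the same arithmetic, handy as a sanity check (the function name is hypothetical, not part of the package):

    def expected_services(colab_mode: bool, backend: str) -> int:
        """Mirror start_services(): the API server always runs; MLflow and the frontend are gated."""
        total = 1  # API server always runs
        if not (colab_mode and backend == "tensorboard"):
            total += 1  # MLflow runs unless tensorboard-only in Colab
        if not colab_mode:
            total += 1  # frontend runs unless Colab mode
        return total

    assert expected_services(False, "mlflow") == 3      # local default: all three services
    assert expected_services(True, "tensorboard") == 1  # Colab, TensorBoard-only: API only
    assert expected_services(True, "both") == 2         # Colab with MLflow: MLflow + API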
rapidfireai/utils/constants.py CHANGED
@@ -1,7 +1,28 @@
+import os
 from enum import Enum
 
 MLFLOW_URL = "http://127.0.0.1:5002"
 
+
+# Tracking Backend Configuration
+def get_tracking_backend() -> str:
+    """
+    Get the tracking backend from environment variable at runtime.
+
+    Returns:
+        str: The tracking backend ('mlflow', 'tensorboard', or 'both')
+
+    Note: This reads from os.environ at runtime to allow setting the env var
+    after module import (important for notebook environments like Colab).
+    """
+    backend = os.getenv("RF_TRACKING_BACKEND", "mlflow")
+    return backend
+
+
+# Backwards compatibility: Keep constant but it will be stale if env var changes after import
+TRACKING_BACKEND = get_tracking_backend()  # Options: 'mlflow', 'tensorboard', 'both'
+TENSORBOARD_LOG_DIR = os.getenv("RF_TENSORBOARD_LOG_DIR", None)  # Default set by experiment path
+
 # Shared Memory Constants
 SHM_WARN_THRESHOLD = 80
 SHM_MIN_FREE_SPACE = 1.0
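
Note: the distinction between the getter and the module-level constant matters in notebooks, where the env var is often set after the module has already been imported. A small sketch of the difference:

    import os
    from rapidfireai.utils.constants import TRACKING_BACKEND, get_tracking_backend

    # Simulate a notebook cell that switches backends after import:
    os.environ["RF_TRACKING_BACKEND"] = "tensorboard"

    print(TRACKING_BACKEND)        # likely still "mlflow": frozen at import time
    print(get_tracking_backend())  # "tensorboard": re-read from os.environ on each call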
@@ -24,7 +45,7 @@ class DispatcherConfig:
     """Class to manage the dispatcher configuration"""
 
     HOST: str = "127.0.0.1"
-    PORT: int = 8080
+    PORT: int = 8081
 
 
 # Database Constants
rapidfireai/utils/experiment_utils.py CHANGED
@@ -8,7 +8,6 @@ import sys
 import warnings
 from typing import Any
 
-import mlflow
 import pandas as pd
 import torch
 from IPython.display import display
@@ -16,11 +15,13 @@ from tqdm import tqdm
 from transformers import logging as transformers_logging
 
 from rapidfireai.db.rf_db import RfDb
-from rapidfireai.utils.constants import MLFLOW_URL, ExperimentStatus, ExperimentTask
+from rapidfireai.utils.constants import MLFLOW_URL, ExperimentStatus, ExperimentTask, get_tracking_backend
 from rapidfireai.utils.datapaths import DataPath
 from rapidfireai.utils.exceptions import DBException, ExperimentException
 from rapidfireai.utils.logging import RFLogger
-from rapidfireai.utils.mlflow_manager import MLflowManager
+
+# Note: mlflow and MLflowManager are imported lazily inside conditional blocks
+# to avoid MLflow connection attempts when using tensorboard-only mode
 
 
 class ExperimentUtils:
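
Note: the lazy-import convention described in the comment keeps `import mlflow` off the tensorboard-only code path entirely, so no MLflow connection is ever attempted there. A condensed sketch of the pattern (the helper name is hypothetical; the real methods below inline it):

    from rapidfireai.utils.constants import get_tracking_backend

    def _maybe_clear_mlflow_context() -> None:
        if get_tracking_backend() not in ("mlflow", "both"):
            return  # tensorboard-only: mlflow is never imported here
        import mlflow  # imported only on the MLflow path
        if mlflow.active_run():
            mlflow.end_run()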
@@ -82,12 +83,16 @@
         self._disable_ml_warnings_display()
 
         # Clear any existing MLflow context before starting new experiment
-        try:
-            if mlflow.active_run():
-                print("Clearing existing MLflow context before starting new experiment")
-                mlflow.end_run()
-        except Exception as e:
-            print(f"Error clearing existing MLflow context: {e}")
+        # Only if using MLflow backend
+        tracking_backend = get_tracking_backend()
+        if tracking_backend in ["mlflow", "both"]:
+            import mlflow  # Lazy import to avoid connection attempts in tensorboard-only mode
+            try:
+                if mlflow.active_run():
+                    print("Clearing existing MLflow context before starting new experiment")
+                    mlflow.end_run()
+            except Exception as e:
+                print(f"Error clearing existing MLflow context: {e}")
 
         # check if experiment is already running
         running_experiment = None
@@ -124,11 +129,18 @@
                 given_name,
                 experiments_path,
             )
-            msg = (
-                f"The previously running experiment {running_experiment['experiment_name']} was forcibly ended."
-                f" Created a new experiment with name '{experiment_name}' with Experiment ID: {experiment_id}"
-                f" and MLFlow Experiment ID: {mlflow_experiment_id} saved at {experiments_path}/{experiment_name}"
-            )
+            if mlflow_experiment_id:
+                msg = (
+                    f"The previously running experiment {running_experiment['experiment_name']} was forcibly ended."
+                    f" Created a new experiment '{experiment_name}' with Experiment ID: {experiment_id}"
+                    f" and MLflow Experiment ID: {mlflow_experiment_id} at {experiments_path}/{experiment_name}"
+                )
+            else:
+                msg = (
+                    f"The previously running experiment {running_experiment['experiment_name']} was forcibly ended."
+                    f" Created a new experiment '{experiment_name}' with Experiment ID: {experiment_id}"
+                    f" at {experiments_path}/{experiment_name} (TensorBoard-only mode)"
+                )
             print(msg)
             log_messages.append(msg)
         # check if experiment name already exists
@@ -137,11 +149,18 @@
                 given_name,
                 experiments_path,
             )
-            msg = (
-                "An experiment with the same name already exists."
-                f" Created a new experiment with name '{experiment_name}' with Experiment ID: {experiment_id}"
-                f" and MLFlow Experiment ID: {mlflow_experiment_id} saved at {experiments_path}/{experiment_name}"
-            )
+            if mlflow_experiment_id:
+                msg = (
+                    "An experiment with the same name already exists."
+                    f" Created a new experiment '{experiment_name}' with Experiment ID: {experiment_id}"
+                    f" and MLflow Experiment ID: {mlflow_experiment_id} at {experiments_path}/{experiment_name}"
+                )
+            else:
+                msg = (
+                    "An experiment with the same name already exists."
+                    f" Created a new experiment '{experiment_name}' with Experiment ID: {experiment_id}"
+                    f" at {experiments_path}/{experiment_name} (TensorBoard-only mode)"
+                )
             print(msg)
             log_messages.append(msg)
         else:
@@ -149,10 +168,16 @@
                 given_name,
                 experiments_path,
             )
-            msg = (
-                f"Experiment {experiment_name} created with Experiment ID: {experiment_id}"
-                f" and MLFlow Experiment ID: {mlflow_experiment_id} saved at {experiments_path}/{experiment_name}"
-            )
+            if mlflow_experiment_id:
+                msg = (
+                    f"Experiment {experiment_name} created with Experiment ID: {experiment_id}"
+                    f" and MLflow Experiment ID: {mlflow_experiment_id} at {experiments_path}/{experiment_name}"
+                )
+            else:
+                msg = (
+                    f"Experiment {experiment_name} created with Experiment ID: {experiment_id}"
+                    f" at {experiments_path}/{experiment_name} (TensorBoard-only mode)"
+                )
             print(msg)
             log_messages.append(msg)
 
@@ -185,20 +210,24 @@
         self.db.set_experiment_status(current_experiment["experiment_id"], ExperimentStatus.COMPLETED)
         self.db.reset_all_tables()
 
-        # Clear MLflow context
-        try:
-            if mlflow.active_run():
-                print("Ending active MLflow run before ending experiment")
-                mlflow.end_run()
-
-            # Also clear context through MLflowManager if available
+        # Clear MLflow context only if using MLflow backend
+        tracking_backend = get_tracking_backend()
+        if tracking_backend in ["mlflow", "both"]:
+            import mlflow  # Lazy import to avoid connection attempts in tensorboard-only mode
+            from rapidfireai.utils.mlflow_manager import MLflowManager
             try:
-                mlflow_manager = MLflowManager(MLFLOW_URL)
-                mlflow_manager.clear_context()
-            except Exception as e2:
-                print(f"[Error clearing MLflow context through MLflowManager: {e2}")
-        except Exception as e:
-            print(f"Error clearing MLflow context: {e}")
+                if mlflow.active_run():
+                    print("Ending active MLflow run before ending experiment")
+                    mlflow.end_run()
+
+                # Also clear context through MLflowManager if available
+                try:
+                    mlflow_manager = MLflowManager(MLFLOW_URL)
+                    mlflow_manager.clear_context()
+                except Exception as e2:
+                    print(f"Error clearing MLflow context through MLflowManager: {e2}")
+            except Exception as e:
+                print(f"Error clearing MLflow context: {e}")
 
         # print experiment ended message
         msg = f"Experiment {experiment_name} ended"
@@ -311,28 +340,43 @@
             print(f"Error displaying runs info: {e}")
             raise
 
-    def _create_experiment_internal(self, given_name: str, experiments_path: str) -> tuple[int, str, str]:
+    def _create_experiment_internal(self, given_name: str, experiments_path: str) -> tuple[int, str, str | None]:
         """Create new experiment -
         if given_name already exists - increment suffix and create new experiment
         if given_name is new - create new experiment with given name
+        Returns: experiment_id, experiment_name, mlflow_experiment_id (or None if tensorboard-only)
         """
         try:
             given_name = given_name if given_name else "rf-exp"
             experiment_name = self._generate_unique_experiment_name(given_name, self.db.get_all_experiment_names())
 
-            mlflow_manager = MLflowManager(MLFLOW_URL)
-            mlflow_experiment_id = mlflow_manager.create_experiment(experiment_name)
-            mlflow.tracing.disable_notebook_display()
+            # Create MLflow experiment only if using MLflow backend
+            mlflow_experiment_id = None
+            tracking_backend = get_tracking_backend()
+            if tracking_backend in ["mlflow", "both"]:
+                import mlflow  # Lazy import to avoid connection attempts in tensorboard-only mode
+                from rapidfireai.utils.mlflow_manager import MLflowManager
+                try:
+                    mlflow_manager = MLflowManager(MLFLOW_URL)
+                    mlflow_experiment_id = mlflow_manager.create_experiment(experiment_name)
+                    mlflow.tracing.disable_notebook_display()
+                except Exception as e:
+                    # Catch MLflow-specific exceptions (mlflow.exceptions.RestException, etc.)
+                    raise ExperimentException(f"Error creating MLFlow experiment: {e}") from e
 
             # write new experiment details to database
             experiment_id = self.db.create_experiment(
                 experiment_name,
-                mlflow_experiment_id,
+                mlflow_experiment_id,  # Will be None for tensorboard-only
                 config_options={"experiments_path": experiments_path},
             )
             return experiment_id, experiment_name, mlflow_experiment_id
-        except mlflow.exceptions.RestException as e:  # pyright: ignore
-            raise ExperimentException(f"Error creating MLFlow experiment: {e}") from e
+        except ExperimentException:
+            # Re-raise ExperimentExceptions (including MLflow errors from above)
+            raise
+        except Exception as e:
+            # Catch any other unexpected errors
+            raise ExperimentException(f"Error in _create_experiment_internal: {e}") from e
 
     def _generate_unique_experiment_name(self, name: str, existing_names: list[str]) -> str:
         """Increment the suffix of the name after the last '_' till it is unique"""