rapidfireai 0.10.2rc4__py3-none-any.whl → 0.10.3rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (37)
  1. rapidfireai/backend/controller.py +29 -16
  2. rapidfireai/backend/worker.py +14 -7
  3. rapidfireai/cli.py +28 -1
  4. rapidfireai/db/rf_db.py +1 -1
  5. rapidfireai/db/tables.sql +1 -1
  6. rapidfireai/dispatcher/dispatcher.py +3 -1
  7. rapidfireai/dispatcher/gunicorn.conf.py +1 -1
  8. rapidfireai/experiment.py +75 -7
  9. rapidfireai/frontend/build/asset-manifest.json +3 -3
  10. rapidfireai/frontend/build/index.html +1 -1
  11. rapidfireai/frontend/build/static/js/{main.3ff1e37d.js → main.e7d3b759.js} +3 -3
  12. rapidfireai/frontend/build/static/js/{main.3ff1e37d.js.map → main.e7d3b759.js.map} +1 -1
  13. rapidfireai/frontend/proxy_middleware.py +1 -1
  14. rapidfireai/ml/callbacks.py +78 -38
  15. rapidfireai/ml/trainer.py +6 -6
  16. rapidfireai/start.sh +117 -34
  17. rapidfireai/utils/constants.py +20 -1
  18. rapidfireai/utils/experiment_utils.py +87 -43
  19. rapidfireai/utils/interactive_controller.py +494 -0
  20. rapidfireai/utils/metric_logger.py +346 -0
  21. rapidfireai/utils/mlflow_manager.py +0 -2
  22. rapidfireai/utils/worker_manager.py +16 -6
  23. rapidfireai/version.py +2 -2
  24. {rapidfireai-0.10.2rc4.dist-info → rapidfireai-0.10.3rc1.dist-info}/METADATA +7 -4
  25. {rapidfireai-0.10.2rc4.dist-info → rapidfireai-0.10.3rc1.dist-info}/RECORD +37 -34
  26. tutorial_notebooks/rf-colab-tensorboard-tutorial.ipynb +314 -0
  27. tutorial_notebooks/rf-tutorial-dpo-alignment-lite.ipynb +6 -6
  28. tutorial_notebooks/rf-tutorial-dpo-alignment.ipynb +6 -6
  29. tutorial_notebooks/rf-tutorial-grpo-mathreasoning-lite.ipynb +6 -6
  30. tutorial_notebooks/rf-tutorial-grpo-mathreasoning.ipynb +6 -6
  31. tutorial_notebooks/rf-tutorial-sft-chatqa-lite.ipynb +6 -6
  32. tutorial_notebooks/rf-tutorial-sft-chatqa.ipynb +6 -6
  33. /rapidfireai/frontend/build/static/js/{main.3ff1e37d.js.LICENSE.txt → main.e7d3b759.js.LICENSE.txt} +0 -0
  34. {rapidfireai-0.10.2rc4.dist-info → rapidfireai-0.10.3rc1.dist-info}/WHEEL +0 -0
  35. {rapidfireai-0.10.2rc4.dist-info → rapidfireai-0.10.3rc1.dist-info}/entry_points.txt +0 -0
  36. {rapidfireai-0.10.2rc4.dist-info → rapidfireai-0.10.3rc1.dist-info}/licenses/LICENSE +0 -0
  37. {rapidfireai-0.10.2rc4.dist-info → rapidfireai-0.10.3rc1.dist-info}/top_level.txt +0 -0
rapidfireai/frontend/proxy_middleware.py CHANGED
@@ -25,7 +25,7 @@ class UserProxyManager:
         self.default_proxy = {
             'main_proxy_target': 'http://127.0.0.1:5002/',
             'static_proxy_target': 'http://127.0.0.1:5002/',
-            'dispatcher_proxy_target': 'http://127.0.0.1:8080/',
+            'dispatcher_proxy_target': 'http://127.0.0.1:8081/',
         }
 
     def get_user_proxy(self, user_id: str) -> Dict[str, str]:
rapidfireai/ml/callbacks.py CHANGED
@@ -20,7 +20,7 @@ class GenerationMetricsCallback(TrainerCallback):
         generation_config: Optional[Dict] = None,
         compute_metrics: Callable = None,
         batch_size: int = 8,
-        mlflow_manager=None,
+        metric_logger=None,
         mlflow_run_id: str = None,
         completed_steps: int = 0,
     ):
@@ -36,7 +36,7 @@ class GenerationMetricsCallback(TrainerCallback):
             "pad_token_id": tokenizer.pad_token_id,
             "eos_token_id": tokenizer.eos_token_id,
         }
-        self.mlflow_manager = mlflow_manager
+        self.metric_logger = metric_logger
        self.mlflow_run_id = mlflow_run_id
         self.completed_steps = completed_steps
 
@@ -63,8 +63,8 @@ class GenerationMetricsCallback(TrainerCallback):
             state.log_history.append(metrics)
 
         for key, value in metrics.items():
-            if self.mlflow_manager:
-                self.mlflow_manager.log_metric(
+            if self.metric_logger:
+                self.metric_logger.log_metric(
                     self.mlflow_run_id,
                     key,
                     value,
@@ -72,41 +72,69 @@ class GenerationMetricsCallback(TrainerCallback):
             )
 
     def _prepare_data(self, eval_dataset: Dataset) -> tuple:
-        """Prepare batch data for generation"""
+        """Prepare batch data for generation with defensive validation"""
         input_texts = []
         references = []
 
         for item in eval_dataset:
-            if isinstance(item, dict):
-                if "input" in item and "output" in item:
-                    input_text = item["input"]
-                    reference = item["output"]
-                elif "prompt" in item and "completion" in item:
-                    input_text = item["prompt"]
-                    reference = item["completion"][-1]["content"]
-                    input_text = self.tokenizer.apply_chat_template(
-                        input_text, tokenize=False
-                    )
-                else:
-                    continue
-
-                input_texts.append(input_text)
-                references.append(reference)
+            if not isinstance(item, dict):
+                continue
+
+            input_text = None
+            reference = None
+
+            # Support multiple field name patterns
+            if "input" in item and "output" in item:
+                input_text = item["input"]
+                reference = item["output"]
+            elif "prompt" in item and "completion" in item:
+                input_text = item["prompt"]
+                reference = item["completion"][-1]["content"]
+                input_text = self.tokenizer.apply_chat_template(
+                    input_text, tokenize=False
+                )
+            elif "text" in item:
+                # SFT format - use text as input, response as reference
+                input_text = item["text"]
+                reference = item.get("response", item.get("instruction", item["text"]))
+            elif "instruction" in item and "response" in item:
+                # Direct instruction/response format
+                input_text = item["instruction"]
+                reference = item["response"]
+
+            # Validate non-empty strings
+            if input_text and isinstance(input_text, str) and input_text.strip():
+                if reference and isinstance(reference, str) and reference.strip():
+                    input_texts.append(input_text.strip())
+                    references.append(reference.strip())
+
+        # Return safe empty values to prevent downstream errors
+        if not input_texts:
+            return [], []
 
         return input_texts, references
 
-    def _generate_batch(self, model, input_texts: List[str]) -> List[str]:
-        """Generate text for a batch of inputs"""
-        # Tokenize batch
-        inputs = self.tokenizer(
-            input_texts,
-            return_tensors="pt",
-            padding=True,
-            truncation=True,
-            max_length=512,  # Adjust based on your model's context length
-        ).to(model.device)
+    def _generate_batch(self, model, input_texts: List[str]) -> torch.Tensor:
+        """Generate text for a batch of inputs with defensive validation"""
+        # Defensive validation for empty inputs
+        if not input_texts:
+            return torch.empty((0, 0), dtype=torch.long).to(model.device)
 
-        return inputs["input_ids"]
+        try:
+            # Tokenize batch
+            inputs = self.tokenizer(
+                input_texts,
+                return_tensors="pt",
+                padding=True,
+                truncation=True,
+                max_length=512,  # Adjust based on your model's context length
+            ).to(model.device)
+
+            return inputs["input_ids"]
+        except Exception as e:
+            # Log error and return empty tensor to prevent crash
+            print(f"Warning: Tokenization error in generation callback: {e}")
+            return torch.empty((0, 0), dtype=torch.long).to(model.device)
 
     def _compute_generation_metrics(self, model, step: int) -> Dict[str, float]:
         """Generate text and compute BLEU/ROUGE metrics with batch processing"""
@@ -121,7 +149,19 @@ class GenerationMetricsCallback(TrainerCallback):
 
         # Process in batches
         input_texts, batch_references = self._prepare_data(self.eval_dataset)
+
+        # Early return if no valid data
+        if not input_texts:
+            print("Warning: No valid eval data for generation metrics")
+            return {}
+
         input_ids = self._generate_batch(model, input_texts)
+
+        # Check for empty generation batch
+        if input_ids.numel() == 0:
+            print("Warning: Empty input_ids from tokenization")
+            return {}
+
         with torch.no_grad():
             for i in tqdm(
                 range(0, len(indices), self.batch_size), desc="Generating for metrics"
@@ -155,18 +195,18 @@ class GenerationMetricsCallback(TrainerCallback):
 
 
 class MLflowLoggingCallback(TrainerCallback):
-    """Callback for logging metrics to MLflow during training"""
+    """Callback for logging metrics to tracking backend during training"""
 
     def __init__(
         self,
-        mlflow_manager,
+        metric_logger,
         mlflow_run_id: str,
         excluded_keys: list = None,
         completed_steps: int = 0,
         chunk_id: int = 0,
         num_epochs_completed: int = 0,
     ):
-        self.mlflow_manager = mlflow_manager
+        self.metric_logger = metric_logger
         self.mlflow_run_id = mlflow_run_id
         self.completed_steps = completed_steps
         self.excluded_keys = excluded_keys or [
@@ -189,22 +229,22 @@ class MLflowLoggingCallback(TrainerCallback):
         for key, value in logs.items():
             if isinstance(value, (int, float)) and key not in self.excluded_keys:
                 try:
-                    self.mlflow_manager.log_metric(
+                    self.metric_logger.log_metric(
                         self.mlflow_run_id,
                         key,
                         value,
                         step=self.completed_steps + state.global_step,
                     )
                 except Exception as e:
-                    print(f"Warning: Failed to log metric {key} to MLflow: {e}")
+                    print(f"Warning: Failed to log metric {key} to tracking backend: {e}")
         if "eval_loss" not in logs and "train_runtime" not in logs:
-            self.mlflow_manager.log_metric(
+            self.metric_logger.log_metric(
                 self.mlflow_run_id,
                 "chunk number",
                 self.chunk_id,
                 step=self.completed_steps + state.global_step,
             )
-            self.mlflow_manager.log_metric(
+            self.metric_logger.log_metric(
                 self.mlflow_run_id,
                 "num_epochs_completed",
                 self.num_epochs_completed,
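
The metric_logger object these callbacks now receive is implemented in the new rapidfireai/utils/metric_logger.py (not shown in this hunk). As a rough, hypothetical sketch of the duck-typed interface the callbacks rely on; only log_metric(run_id, key, value, step=...) is confirmed by the diff, everything else here is an assumption:

# Hypothetical sketch; the real MetricLogger in metric_logger.py may differ.
class SketchMetricLogger:
    def __init__(self, tensorboard_writer=None, mlflow_client=None):
        self.tb = tensorboard_writer    # e.g. torch.utils.tensorboard.SummaryWriter
        self.mlflow = mlflow_client     # e.g. mlflow.tracking.MlflowClient

    def log_metric(self, run_id: str, key: str, value: float, step: int = 0) -> None:
        # Fan out to whichever tracking backends are configured.
        if self.tb is not None:
            self.tb.add_scalar(key, value, global_step=step)
        if self.mlflow is not None:
            self.mlflow.log_metric(run_id, key, value, step=step)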
rapidfireai/ml/trainer.py CHANGED
@@ -34,7 +34,7 @@ def create_trainer_instance(
     trainer_config: TrainerConfig,
     shm_manager: SharedMemoryManager,
     use_shared_memory: bool = False,
-    mlflow_manager=None,
+    metric_logger=None,
     chunk_id: int = 0,
 ) -> tuple[SFTTrainer | DPOTrainer | GRPOTrainer | None, str]:
     """
@@ -98,7 +98,7 @@
 
     callbacks, additional_trainer_kwargs = (
         _setup_callbacks(  # FIXME: avoid returning additional_trainer_kwargs
-            mlflow_manager,
+            metric_logger,
             trainer_config,
             chunk_id,
             compute_metrics,
@@ -314,7 +314,7 @@ def _prepare_trainer_kwargs(
 
 
 def _setup_callbacks(
-    mlflow_manager,
+    metric_logger,
     trainer_config,
     chunk_id,
     compute_metrics,
@@ -327,9 +327,9 @@ def _setup_callbacks(
     """Setup callbacks for the trainer."""
     callbacks = []
 
-    if mlflow_manager is not None and trainer_config.mlflow_run_id is not None:
+    if metric_logger is not None and trainer_config.mlflow_run_id is not None:
         mlflow_callback = MLflowLoggingCallback(
-            mlflow_manager=mlflow_manager,
+            metric_logger=metric_logger,
             mlflow_run_id=trainer_config.mlflow_run_id,
             completed_steps=trainer_config.completed_steps,
             chunk_id=chunk_id,
@@ -353,7 +353,7 @@
             generation_config=additional_trainer_kwargs.get("generation_config"),
             compute_metrics=compute_metrics_function,
             batch_size=training_args.get("per_device_eval_batch_size"),
-            mlflow_manager=mlflow_manager,
+            metric_logger=metric_logger,
             mlflow_run_id=trainer_config.mlflow_run_id,
             completed_steps=trainer_config.completed_steps,
         )
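
To illustrate the plumbing, any object exposing that log_metric signature can be threaded through the renamed metric_logger parameter; a hedged wiring sketch reusing the hypothetical SketchMetricLogger above (the run id and counters are placeholders):

# Hypothetical wiring sketch; mirrors how _setup_callbacks constructs the callback.
logger = SketchMetricLogger()
callback = MLflowLoggingCallback(
    metric_logger=logger,
    mlflow_run_id="run-123",   # placeholder run id
    completed_steps=0,
    chunk_id=0,
)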
rapidfireai/start.sh CHANGED
@@ -12,11 +12,15 @@ RF_MLFLOW_HOST=${RF_MLFLOW_HOST:=127.0.0.1}
 RF_FRONTEND_PORT=${RF_FRONTEND_PORT:=3000}
 RF_FRONTEND_HOST=${RF_FRONTEND_HOST:=0.0.0.0}
 # API server configuration - these should match DispatcherConfig in constants.py
-RF_API_PORT=${RF_API_PORT:=8080}
+RF_API_PORT=${RF_API_PORT:=8081}
 RF_API_HOST=${RF_API_HOST:=127.0.0.1}
 
 RF_DB_PATH="${RF_DB_PATH:=$HOME/db}"
 
+# Colab mode configuration
+RF_COLAB_MODE=${RF_COLAB_MODE:=false}
+RF_TRACKING_BACKEND=${RF_TRACKING_BACKEND:=mlflow}
+
 # Colors for output
 RED='\033[0;31m'
 GREEN='\033[0;32m'
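
As a usage sketch, a Colab notebook cell could export the new variables before the services are started (the assignments mirror the defaults above; how the start script is invoked is not shown in this hunk):

# Hypothetical Colab cell: set the new environment variables before launching services.
import os
os.environ["RF_COLAB_MODE"] = "true"                # skip the frontend, relax cleanup
os.environ["RF_TRACKING_BACKEND"] = "tensorboard"   # 'mlflow', 'tensorboard', or 'both'
os.environ["RF_API_PORT"] = "8081"                  # should match DispatcherConfig.PORT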
@@ -124,20 +128,15 @@ cleanup() {
         rm -f "$RF_PID_FILE"
     fi
 
-    # Final cleanup - kill any remaining MLflow, gunicorn, or Flask processes
-    pkill -f "mlflow server" 2>/dev/null || true
-    pkill -f "gunicorn.*rapidfireai" 2>/dev/null || true
-    pkill -f "python3.*server.py" 2>/dev/null || true
-    pkill -f "python.*server.py" 2>/dev/null || true
-
-    # Additional cleanup for any remaining processes on our ports
-    for port in $RF_MLFLOW_PORT $RF_FRONTEND_PORT $RF_API_PORT; do
-        local remaining_pids=$(lsof -ti :$port 2>/dev/null || true)
-        if [[ -n "$remaining_pids" ]]; then
-            print_status "Force killing remaining processes on port $port"
-            echo "$remaining_pids" | xargs kill -9 2>/dev/null || true
-        fi
-    done
+    # Final cleanup - ONLY if NOT in Colab mode
+    # Colab mode skips this to avoid killing Jupyter/IPython infrastructure
+    if [[ "$RF_COLAB_MODE" != "true" ]]; then
+        # Safe, specific patterns for non-Colab environments
+        pkill -f "mlflow server" 2>/dev/null || true
+        pkill -f "gunicorn.*rapidfireai.dispatcher" 2>/dev/null || true
+        # Only kill Flask server if we're not in Colab (frontend doesn't run in Colab)
+        pkill -f "python.*rapidfireai/frontend/server.py" 2>/dev/null || true
+    fi
 
     print_success "All services stopped"
     exit 0
@@ -280,6 +279,10 @@ start_mlflow() {
             grep -A 5 -B 2 "Error\|Exception\|Traceback\|Failed\|ImportError\|ModuleNotFoundError" "$SCRIPT_DIR/mlflow.log" | head -20
         fi
     else
+        if [[ "$RF_COLAB_MODE" == "true" ]] && [[ "$RF_TRACKING_BACKEND" == "tensorboard" ]]; then
+            print_status "⊗ Skipping MLflow (using TensorBoard-only tracking in Colab mode)"
+            return 0
+        fi
         print_error "No mlflow.log file found"
     fi
 
@@ -293,6 +296,19 @@ start_mlflow() {
     fi
 }
 
+# Function to conditionally start MLflow based on mode
+start_mlflow_if_needed() {
+    # In Colab mode with pure TensorBoard, skip MLflow
+    if [[ "$RF_COLAB_MODE" == "true" ]] && [[ "$RF_TRACKING_BACKEND" == "tensorboard" ]]; then
+        print_status "⊗ Skipping MLflow (using TensorBoard-only tracking in Colab mode)"
+        return 0
+    fi
+
+    # Otherwise start MLflow
+    start_mlflow
+    return $?
+}
+
 # Function to start API server
 start_api_server() {
     print_status "Starting API server with Gunicorn..."
@@ -481,6 +497,19 @@ start_frontend() {
     return 0
 }
 
+# Function to conditionally start frontend based on mode
+start_frontend_if_needed() {
+    # In Colab mode, always skip frontend
+    if [[ "$RF_COLAB_MODE" == "true" ]]; then
+        print_status "⊗ Skipping frontend (using TensorBoard in Colab mode)"
+        return 0
+    fi
+
+    # Otherwise start frontend
+    start_frontend
+    return $?
+}
+
 # Function to display running services
 show_status() {
     print_status "RapidFire AI Services Status:"
@@ -499,47 +528,101 @@ show_status() {
     fi
 
     echo ""
-    print_success "🚀 RapidFire Frontend is ready!"
-    print_status "👉 Open your browser and navigate to: http://$RF_FRONTEND_HOST:$RF_FRONTEND_PORT"
-    print_status "   (Click the link above or copy/paste the URL into your browser)"
+
+    # Display appropriate message based on mode
+    if [[ "$RF_COLAB_MODE" == "true" ]]; then
+        print_success "🚀 RapidFire running in Colab mode!"
+        print_status "📊 Use TensorBoard for metrics visualization:"
+        print_status "   In a Colab notebook cell, run:"
+        print_status "   %tensorboard --logdir ~/experiments/{experiment_name}/tensorboard_logs"
+        if [[ "$RF_TRACKING_BACKEND" == "mlflow" ]] || [[ "$RF_TRACKING_BACKEND" == "both" ]]; then
+            print_status ""
+            print_status "📈 MLflow UI available at: http://$RF_MLFLOW_HOST:$RF_MLFLOW_PORT"
+        fi
+    else
+        print_success "🚀 RapidFire Frontend is ready!"
+        print_status "👉 Open your browser and navigate to: http://$RF_FRONTEND_HOST:$RF_FRONTEND_PORT"
+        print_status "   (Click the link above or copy/paste the URL into your browser)"
+    fi
 
     # Show log file status
     echo ""
     print_status "Log files:"
-    for log_file in "mlflow.log" "api.log" "frontend.log"; do
-        if [[ -f "$SCRIPT_DIR/$log_file" ]]; then
-            local size=$(du -h "$SCRIPT_DIR/$log_file" | cut -f1)
-            print_status "- $log_file: $size"
+
+    # Always check api.log
+    if [[ -f "$SCRIPT_DIR/api.log" ]]; then
+        local size=$(du -h "$SCRIPT_DIR/api.log" | cut -f1)
+        print_status "- api.log: $size"
+    else
+        print_warning "- api.log: not found"
+    fi
+
+    # Only check mlflow.log if MLflow is running
+    if [[ "$RF_COLAB_MODE" != "true" ]] || [[ "$RF_TRACKING_BACKEND" != "tensorboard" ]]; then
+        if [[ -f "$SCRIPT_DIR/mlflow.log" ]]; then
+            local size=$(du -h "$SCRIPT_DIR/mlflow.log" | cut -f1)
+            print_status "- mlflow.log: $size"
         else
-            print_warning "- $log_file: not found"
+            print_warning "- mlflow.log: not found"
         fi
-    done
+    fi
+
+    # Only check frontend.log if frontend is running
+    if [[ "$RF_COLAB_MODE" != "true" ]]; then
+        if [[ -f "$SCRIPT_DIR/frontend.log" ]]; then
+            local size=$(du -h "$SCRIPT_DIR/frontend.log" | cut -f1)
+            print_status "- frontend.log: $size"
+        else
+            print_warning "- frontend.log: not found"
+        fi
+    fi
 }
 
 # Function to start services based on mode
 start_services() {
     local services_started=0
-    local total_services=3
+    local total_services=1  # API server always runs
 
-    # Start MLflow server
-    if start_mlflow; then
-        ((services_started++))
+    # Calculate total services based on mode
+    # MLflow runs unless tensorboard-only in Colab
+    if [[ "$RF_COLAB_MODE" != "true" ]] || [[ "$RF_TRACKING_BACKEND" != "tensorboard" ]]; then
+        ((total_services++))
+    fi
+
+    # Frontend runs unless Colab mode
+    if [[ "$RF_COLAB_MODE" != "true" ]]; then
+        ((total_services++))
+    fi
+
+    print_status "Starting $total_services service(s)..."
+
+    # Start MLflow server (conditionally)
+    if [[ "$RF_COLAB_MODE" != "true" ]] || [[ "$RF_TRACKING_BACKEND" != "tensorboard" ]]; then
+        if start_mlflow; then
+            ((services_started++))
+        else
+            print_error "Failed to start MLflow server"
+        fi
     else
-        print_error "Failed to start MLflow server"
+        print_status "  Skipping MLflow (using TensorBoard-only tracking in Colab mode)"
     fi
 
-    # Start API server
+    # Start API server (always)
     if start_api_server; then
         ((services_started++))
     else
         print_error "Failed to start API server"
     fi
 
-    # Start frontend server
-    if start_frontend; then
-        ((services_started++))
+    # Start frontend server (conditionally)
+    if [[ "$RF_COLAB_MODE" != "true" ]]; then
+        if start_frontend; then
+            ((services_started++))
+        else
+            print_error "Failed to start frontend server"
+        fi
     else
-        print_error "Failed to start frontend server"
+        print_status "  Skipping frontend (using TensorBoard in Colab mode)"
     fi
 
     return $((total_services - services_started))
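
The Colab-mode message in show_status points at the TensorBoard magics; in a notebook that might look like the following, where the experiment name is a placeholder following the path pattern printed above:

# Hypothetical Colab notebook cell
%load_ext tensorboard
%tensorboard --logdir ~/experiments/my_experiment/tensorboard_logs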
rapidfireai/utils/constants.py CHANGED
@@ -1,7 +1,26 @@
 from enum import Enum
+import os
 
 MLFLOW_URL = "http://127.0.0.1:5002"
 
+# Tracking Backend Configuration
+def get_tracking_backend() -> str:
+    """
+    Get the tracking backend from environment variable at runtime.
+
+    Returns:
+        str: The tracking backend ('mlflow', 'tensorboard', or 'both')
+
+    Note: This reads from os.environ at runtime to allow setting the env var
+    after module import (important for notebook environments like Colab).
+    """
+    backend = os.getenv("RF_TRACKING_BACKEND", "mlflow")
+    return backend
+
+# Backwards compatibility: Keep constant but it will be stale if env var changes after import
+TRACKING_BACKEND = get_tracking_backend()  # Options: 'mlflow', 'tensorboard', 'both'
+TENSORBOARD_LOG_DIR = os.getenv("RF_TENSORBOARD_LOG_DIR", None)  # Default set by experiment path
+
 # Shared Memory Constants
 SHM_WARN_THRESHOLD = 80
 SHM_MIN_FREE_SPACE = 1.0
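
A small illustrative sketch of why the function form matters in notebook environments (module path taken from the file list above; the scenario itself is hypothetical):

# Illustrative only: the constant is frozen at import time, the function is not.
import os
from rapidfireai.utils import constants

os.environ["RF_TRACKING_BACKEND"] = "tensorboard"  # set *after* the module was imported

print(constants.TRACKING_BACKEND)          # still "mlflow" (evaluated at import time)
print(constants.get_tracking_backend())    # "tensorboard" (reads os.environ at call time)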
@@ -24,7 +43,7 @@ class DispatcherConfig:
     """Class to manage the dispatcher configuration"""
 
     HOST: str = "127.0.0.1"
-    PORT: int = 8080
+    PORT: int = 8081
 
 
 # Database Constants