rapidfireai 0.10.2rc4__py3-none-any.whl → 0.10.3rc1__py3-none-any.whl
This diff shows the changes between publicly released versions of the package as they appear in their public registry; it is provided for informational purposes only.
- rapidfireai/backend/controller.py +29 -16
- rapidfireai/backend/worker.py +14 -7
- rapidfireai/cli.py +28 -1
- rapidfireai/db/rf_db.py +1 -1
- rapidfireai/db/tables.sql +1 -1
- rapidfireai/dispatcher/dispatcher.py +3 -1
- rapidfireai/dispatcher/gunicorn.conf.py +1 -1
- rapidfireai/experiment.py +75 -7
- rapidfireai/frontend/build/asset-manifest.json +3 -3
- rapidfireai/frontend/build/index.html +1 -1
- rapidfireai/frontend/build/static/js/{main.3ff1e37d.js → main.e7d3b759.js} +3 -3
- rapidfireai/frontend/build/static/js/{main.3ff1e37d.js.map → main.e7d3b759.js.map} +1 -1
- rapidfireai/frontend/proxy_middleware.py +1 -1
- rapidfireai/ml/callbacks.py +78 -38
- rapidfireai/ml/trainer.py +6 -6
- rapidfireai/start.sh +117 -34
- rapidfireai/utils/constants.py +20 -1
- rapidfireai/utils/experiment_utils.py +87 -43
- rapidfireai/utils/interactive_controller.py +494 -0
- rapidfireai/utils/metric_logger.py +346 -0
- rapidfireai/utils/mlflow_manager.py +0 -2
- rapidfireai/utils/worker_manager.py +16 -6
- rapidfireai/version.py +2 -2
- {rapidfireai-0.10.2rc4.dist-info → rapidfireai-0.10.3rc1.dist-info}/METADATA +7 -4
- {rapidfireai-0.10.2rc4.dist-info → rapidfireai-0.10.3rc1.dist-info}/RECORD +37 -34
- tutorial_notebooks/rf-colab-tensorboard-tutorial.ipynb +314 -0
- tutorial_notebooks/rf-tutorial-dpo-alignment-lite.ipynb +6 -6
- tutorial_notebooks/rf-tutorial-dpo-alignment.ipynb +6 -6
- tutorial_notebooks/rf-tutorial-grpo-mathreasoning-lite.ipynb +6 -6
- tutorial_notebooks/rf-tutorial-grpo-mathreasoning.ipynb +6 -6
- tutorial_notebooks/rf-tutorial-sft-chatqa-lite.ipynb +6 -6
- tutorial_notebooks/rf-tutorial-sft-chatqa.ipynb +6 -6
- /rapidfireai/frontend/build/static/js/{main.3ff1e37d.js.LICENSE.txt → main.e7d3b759.js.LICENSE.txt} +0 -0
- {rapidfireai-0.10.2rc4.dist-info → rapidfireai-0.10.3rc1.dist-info}/WHEEL +0 -0
- {rapidfireai-0.10.2rc4.dist-info → rapidfireai-0.10.3rc1.dist-info}/entry_points.txt +0 -0
- {rapidfireai-0.10.2rc4.dist-info → rapidfireai-0.10.3rc1.dist-info}/licenses/LICENSE +0 -0
- {rapidfireai-0.10.2rc4.dist-info → rapidfireai-0.10.3rc1.dist-info}/top_level.txt +0 -0
@@ -25,7 +25,7 @@ class UserProxyManager:
         self.default_proxy = {
             'main_proxy_target': 'http://127.0.0.1:5002/',
             'static_proxy_target': 'http://127.0.0.1:5002/',
-            'dispatcher_proxy_target': 'http://127.0.0.1:
+            'dispatcher_proxy_target': 'http://127.0.0.1:8081/',
         }

     def get_user_proxy(self, user_id: str) -> Dict[str, str]:
rapidfireai/ml/callbacks.py
CHANGED
@@ -20,7 +20,7 @@ class GenerationMetricsCallback(TrainerCallback):
         generation_config: Optional[Dict] = None,
         compute_metrics: Callable = None,
         batch_size: int = 8,
-
+        metric_logger=None,
         mlflow_run_id: str = None,
         completed_steps: int = 0,
     ):
@@ -36,7 +36,7 @@ class GenerationMetricsCallback(TrainerCallback):
             "pad_token_id": tokenizer.pad_token_id,
             "eos_token_id": tokenizer.eos_token_id,
         }
-        self.
+        self.metric_logger = metric_logger
         self.mlflow_run_id = mlflow_run_id
         self.completed_steps = completed_steps

@@ -63,8 +63,8 @@ class GenerationMetricsCallback(TrainerCallback):
             state.log_history.append(metrics)

         for key, value in metrics.items():
-            if self.
-                self.
+            if self.metric_logger:
+                self.metric_logger.log_metric(
                     self.mlflow_run_id,
                     key,
                     value,
@@ -72,41 +72,69 @@ class GenerationMetricsCallback(TrainerCallback):
                 )

     def _prepare_data(self, eval_dataset: Dataset) -> tuple:
-        """Prepare batch data for generation"""
+        """Prepare batch data for generation with defensive validation"""
         input_texts = []
         references = []

         for item in eval_dataset:
-            if isinstance(item, dict):
-            [14 removed lines truncated in this diff view]
+            if not isinstance(item, dict):
+                continue
+
+            input_text = None
+            reference = None
+
+            # Support multiple field name patterns
+            if "input" in item and "output" in item:
+                input_text = item["input"]
+                reference = item["output"]
+            elif "prompt" in item and "completion" in item:
+                input_text = item["prompt"]
+                reference = item["completion"][-1]["content"]
+                input_text = self.tokenizer.apply_chat_template(
+                    input_text, tokenize=False
+                )
+            elif "text" in item:
+                # SFT format - use text as input, response as reference
+                input_text = item["text"]
+                reference = item.get("response", item.get("instruction", item["text"]))
+            elif "instruction" in item and "response" in item:
+                # Direct instruction/response format
+                input_text = item["instruction"]
+                reference = item["response"]
+
+            # Validate non-empty strings
+            if input_text and isinstance(input_text, str) and input_text.strip():
+                if reference and isinstance(reference, str) and reference.strip():
+                    input_texts.append(input_text.strip())
+                    references.append(reference.strip())
+
+        # Return safe empty values to prevent downstream errors
+        if not input_texts:
+            return [], []

         return input_texts, references

-    def _generate_batch(self, model, input_texts: List[str]) ->
-        """Generate text for a batch of inputs"""
-        #
-        [2 removed lines truncated in this diff view]
-            return_tensors="pt",
-            padding=True,
-            truncation=True,
-            max_length=512, # Adjust based on your model's context length
-        ).to(model.device)
+    def _generate_batch(self, model, input_texts: List[str]) -> torch.Tensor:
+        """Generate text for a batch of inputs with defensive validation"""
+        # Defensive validation for empty inputs
+        if not input_texts:
+            return torch.empty((0, 0), dtype=torch.long).to(model.device)

-
+        try:
+            # Tokenize batch
+            inputs = self.tokenizer(
+                input_texts,
+                return_tensors="pt",
+                padding=True,
+                truncation=True,
+                max_length=512, # Adjust based on your model's context length
+            ).to(model.device)
+
+            return inputs["input_ids"]
+        except Exception as e:
+            # Log error and return empty tensor to prevent crash
+            print(f"Warning: Tokenization error in generation callback: {e}")
+            return torch.empty((0, 0), dtype=torch.long).to(model.device)

     def _compute_generation_metrics(self, model, step: int) -> Dict[str, float]:
         """Generate text and compute BLEU/ROUGE metrics with batch processing"""
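For context, the rewritten _prepare_data accepts several record layouts. The sketch below shows illustrative eval records matching the field patterns handled in the hunk above (input/output, chat-style prompt/completion, text, and instruction/response); only the field names come from the diff, the example values are hypothetical.

# Illustrative records only; the field names mirror the branches in _prepare_data above.
eval_items = [
    {"input": "Translate to French: Hello", "output": "Bonjour"},          # input/output pair
    {"prompt": [{"role": "user", "content": "What is 2 + 2?"}],
     "completion": [{"role": "assistant", "content": "4"}]},               # chat prompt/completion
    {"text": "Summarize the article...", "response": "A short summary."},  # SFT text format
    {"instruction": "List three primes.", "response": "2, 3, 5"},          # instruction/response
    "not a dict",                    # skipped: fails the isinstance(item, dict) check
    {"input": "", "output": "  "},   # dropped: fails the non-empty string validation
]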
@@ -121,7 +149,19 @@ class GenerationMetricsCallback(TrainerCallback):

         # Process in batches
         input_texts, batch_references = self._prepare_data(self.eval_dataset)
+
+        # Early return if no valid data
+        if not input_texts:
+            print("Warning: No valid eval data for generation metrics")
+            return {}
+
         input_ids = self._generate_batch(model, input_texts)
+
+        # Check for empty generation batch
+        if input_ids.numel() == 0:
+            print("Warning: Empty input_ids from tokenization")
+            return {}
+
         with torch.no_grad():
             for i in tqdm(
                 range(0, len(indices), self.batch_size), desc="Generating for metrics"
@@ -155,18 +195,18 @@ class GenerationMetricsCallback(TrainerCallback):


 class MLflowLoggingCallback(TrainerCallback):
-    """Callback for logging metrics to
+    """Callback for logging metrics to tracking backend during training"""

     def __init__(
         self,
-
+        metric_logger,
         mlflow_run_id: str,
         excluded_keys: list = None,
         completed_steps: int = 0,
         chunk_id: int = 0,
         num_epochs_completed: int = 0,
     ):
-        self.
+        self.metric_logger = metric_logger
         self.mlflow_run_id = mlflow_run_id
         self.completed_steps = completed_steps
         self.excluded_keys = excluded_keys or [
@@ -189,22 +229,22 @@ class MLflowLoggingCallback(TrainerCallback):
         for key, value in logs.items():
             if isinstance(value, (int, float)) and key not in self.excluded_keys:
                 try:
-                    self.
+                    self.metric_logger.log_metric(
                         self.mlflow_run_id,
                         key,
                         value,
                         step=self.completed_steps + state.global_step,
                     )
                 except Exception as e:
-                    print(f"Warning: Failed to log metric {key} to
+                    print(f"Warning: Failed to log metric {key} to tracking backend: {e}")
         if "eval_loss" not in logs and "train_runtime" not in logs:
-            self.
+            self.metric_logger.log_metric(
                 self.mlflow_run_id,
                 "chunk number",
                 self.chunk_id,
                 step=self.completed_steps + state.global_step,
             )
-            self.
+            self.metric_logger.log_metric(
                 self.mlflow_run_id,
                 "num_epochs_completed",
                 self.num_epochs_completed,
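Both callbacks above only assume a logger object exposing log_metric(run_id, key, value, step=...); when no logger is passed, GenerationMetricsCallback skips logging and _setup_callbacks (in trainer.py below) does not attach MLflowLoggingCallback at all. A minimal, hypothetical sketch of an object satisfying that call contract is shown here; it is not the actual MetricLogger added in rapidfireai/utils/metric_logger.py, whose implementation is not part of this excerpt.

# Hypothetical sketch of the duck-typed contract used by the callbacks above.
# The real rapidfireai/utils/metric_logger.py added in 0.10.3rc1 may differ.
class StdoutMetricLogger:
    """Minimal logger: any backend (MLflow, TensorBoard, or both) with this method works."""

    def log_metric(self, run_id: str, key: str, value: float, step: int = 0) -> None:
        # A real implementation would forward to mlflow.log_metric and/or a
        # TensorBoard SummaryWriter; printing is used here purely for illustration.
        print(f"[run {run_id}] step={step} {key}={value}")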
rapidfireai/ml/trainer.py
CHANGED
@@ -34,7 +34,7 @@ def create_trainer_instance(
     trainer_config: TrainerConfig,
     shm_manager: SharedMemoryManager,
     use_shared_memory: bool = False,
-
+    metric_logger=None,
     chunk_id: int = 0,
 ) -> tuple[SFTTrainer | DPOTrainer | GRPOTrainer | None, str]:
     """
@@ -98,7 +98,7 @@ def create_trainer_instance(

     callbacks, additional_trainer_kwargs = (
         _setup_callbacks(  # FIXME: avoid returning additional_trainer_kwargs
-
+            metric_logger,
             trainer_config,
             chunk_id,
             compute_metrics,
@@ -314,7 +314,7 @@ def _prepare_trainer_kwargs(


 def _setup_callbacks(
-
+    metric_logger,
     trainer_config,
     chunk_id,
     compute_metrics,
@@ -327,9 +327,9 @@ def _setup_callbacks(
     """Setup callbacks for the trainer."""
     callbacks = []

-    if
+    if metric_logger is not None and trainer_config.mlflow_run_id is not None:
         mlflow_callback = MLflowLoggingCallback(
-
+            metric_logger=metric_logger,
             mlflow_run_id=trainer_config.mlflow_run_id,
             completed_steps=trainer_config.completed_steps,
             chunk_id=chunk_id,
@@ -353,7 +353,7 @@ def _setup_callbacks(
             generation_config=additional_trainer_kwargs.get("generation_config"),
             compute_metrics=compute_metrics_function,
             batch_size=training_args.get("per_device_eval_batch_size"),
-
+            metric_logger=metric_logger,
             mlflow_run_id=trainer_config.mlflow_run_id,
             completed_steps=trainer_config.completed_steps,
         )
rapidfireai/start.sh
CHANGED
@@ -12,11 +12,15 @@ RF_MLFLOW_HOST=${RF_MLFLOW_HOST:=127.0.0.1}
 RF_FRONTEND_PORT=${RF_FRONTEND_PORT:=3000}
 RF_FRONTEND_HOST=${RF_FRONTEND_HOST:=0.0.0.0}
 # API server configuration - these should match DispatcherConfig in constants.py
-RF_API_PORT=${RF_API_PORT:=
+RF_API_PORT=${RF_API_PORT:=8081}
 RF_API_HOST=${RF_API_HOST:=127.0.0.1}

 RF_DB_PATH="${RF_DB_PATH:=$HOME/db}"

+# Colab mode configuration
+RF_COLAB_MODE=${RF_COLAB_MODE:=false}
+RF_TRACKING_BACKEND=${RF_TRACKING_BACKEND:=mlflow}
+
 # Colors for output
 RED='\033[0;31m'
 GREEN='\033[0;32m'
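Because RF_COLAB_MODE and RF_TRACKING_BACKEND are plain environment variables with defaults, a Colab notebook would typically export them before the start script runs. A hedged sketch of such a cell follows; only the variable names and accepted values come from this diff, and the CLI command mentioned in the comment is an assumption.

import os

# Assumed Colab-notebook usage; variable names and values match start.sh above.
os.environ["RF_COLAB_MODE"] = "true"               # skip the frontend in Colab
os.environ["RF_TRACKING_BACKEND"] = "tensorboard"  # 'mlflow', 'tensorboard', or 'both'

# The start script would then be launched with these variables in the environment,
# e.g. from a shell cell: !rapidfireai start   (exact CLI command is an assumption)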
@@ -124,20 +128,15 @@ cleanup() {
         rm -f "$RF_PID_FILE"
     fi

-    # Final cleanup -
-    [8 removed lines truncated in this diff view]
-        if [[ -n "$remaining_pids" ]]; then
-            print_status "Force killing remaining processes on port $port"
-            echo "$remaining_pids" | xargs kill -9 2>/dev/null || true
-        fi
-    done
+    # Final cleanup - ONLY if NOT in Colab mode
+    # Colab mode skips this to avoid killing Jupyter/IPython infrastructure
+    if [[ "$RF_COLAB_MODE" != "true" ]]; then
+        # Safe, specific patterns for non-Colab environments
+        pkill -f "mlflow server" 2>/dev/null || true
+        pkill -f "gunicorn.*rapidfireai.dispatcher" 2>/dev/null || true
+        # Only kill Flask server if we're not in Colab (frontend doesn't run in Colab)
+        pkill -f "python.*rapidfireai/frontend/server.py" 2>/dev/null || true
+    fi

     print_success "All services stopped"
     exit 0
@@ -280,6 +279,10 @@ start_mlflow() {
             grep -A 5 -B 2 "Error\|Exception\|Traceback\|Failed\|ImportError\|ModuleNotFoundError" "$SCRIPT_DIR/mlflow.log" | head -20
         fi
     else
+        if [[ "$RF_COLAB_MODE" == "true" ]] && [[ "$RF_TRACKING_BACKEND" == "tensorboard" ]]; then
+            print_status "⊗ Skipping MLflow (using TensorBoard-only tracking in Colab mode)"
+            return 0
+        fi
         print_error "No mlflow.log file found"
     fi

@@ -293,6 +296,19 @@ start_mlflow() {
     fi
 }

+# Function to conditionally start MLflow based on mode
+start_mlflow_if_needed() {
+    # In Colab mode with pure TensorBoard, skip MLflow
+    if [[ "$RF_COLAB_MODE" == "true" ]] && [[ "$RF_TRACKING_BACKEND" == "tensorboard" ]]; then
+        print_status "⊗ Skipping MLflow (using TensorBoard-only tracking in Colab mode)"
+        return 0
+    fi
+
+    # Otherwise start MLflow
+    start_mlflow
+    return $?
+}
+
 # Function to start API server
 start_api_server() {
     print_status "Starting API server with Gunicorn..."
@@ -481,6 +497,19 @@ start_frontend() {
     return 0
 }

+# Function to conditionally start frontend based on mode
+start_frontend_if_needed() {
+    # In Colab mode, always skip frontend
+    if [[ "$RF_COLAB_MODE" == "true" ]]; then
+        print_status "⊗ Skipping frontend (using TensorBoard in Colab mode)"
+        return 0
+    fi
+
+    # Otherwise start frontend
+    start_frontend
+    return $?
+}
+
 # Function to display running services
 show_status() {
     print_status "RapidFire AI Services Status:"
@@ -499,47 +528,101 @@ show_status() {
     fi

     echo ""
-    [3 removed lines truncated in this diff view]
+
+    # Display appropriate message based on mode
+    if [[ "$RF_COLAB_MODE" == "true" ]]; then
+        print_success "🚀 RapidFire running in Colab mode!"
+        print_status "📊 Use TensorBoard for metrics visualization:"
+        print_status "   In a Colab notebook cell, run:"
+        print_status "   %tensorboard --logdir ~/experiments/{experiment_name}/tensorboard_logs"
+        if [[ "$RF_TRACKING_BACKEND" == "mlflow" ]] || [[ "$RF_TRACKING_BACKEND" == "both" ]]; then
+            print_status ""
+            print_status "📈 MLflow UI available at: http://$RF_MLFLOW_HOST:$RF_MLFLOW_PORT"
+        fi
+    else
+        print_success "🚀 RapidFire Frontend is ready!"
+        print_status "👉 Open your browser and navigate to: http://$RF_FRONTEND_HOST:$RF_FRONTEND_PORT"
+        print_status "   (Click the link above or copy/paste the URL into your browser)"
+    fi

     # Show log file status
     echo ""
     print_status "Log files:"
-    [4 removed lines truncated in this diff view]
+
+    # Always check api.log
+    if [[ -f "$SCRIPT_DIR/api.log" ]]; then
+        local size=$(du -h "$SCRIPT_DIR/api.log" | cut -f1)
+        print_status "- api.log: $size"
+    else
+        print_warning "- api.log: not found"
+    fi
+
+    # Only check mlflow.log if MLflow is running
+    if [[ "$RF_COLAB_MODE" != "true" ]] || [[ "$RF_TRACKING_BACKEND" != "tensorboard" ]]; then
+        if [[ -f "$SCRIPT_DIR/mlflow.log" ]]; then
+            local size=$(du -h "$SCRIPT_DIR/mlflow.log" | cut -f1)
+            print_status "- mlflow.log: $size"
         else
-            print_warning "-
+            print_warning "- mlflow.log: not found"
         fi
-
+    fi
+
+    # Only check frontend.log if frontend is running
+    if [[ "$RF_COLAB_MODE" != "true" ]]; then
+        if [[ -f "$SCRIPT_DIR/frontend.log" ]]; then
+            local size=$(du -h "$SCRIPT_DIR/frontend.log" | cut -f1)
+            print_status "- frontend.log: $size"
+        else
+            print_warning "- frontend.log: not found"
+        fi
+    fi
 }

 # Function to start services based on mode
 start_services() {
     local services_started=0
-    local total_services=
+    local total_services=1 # API server always runs

-    #
-    [2 removed lines truncated in this diff view]
+    # Calculate total services based on mode
+    # MLflow runs unless tensorboard-only in Colab
+    if [[ "$RF_COLAB_MODE" != "true" ]] || [[ "$RF_TRACKING_BACKEND" != "tensorboard" ]]; then
+        ((total_services++))
+    fi
+
+    # Frontend runs unless Colab mode
+    if [[ "$RF_COLAB_MODE" != "true" ]]; then
+        ((total_services++))
+    fi
+
+    print_status "Starting $total_services service(s)..."
+
+    # Start MLflow server (conditionally)
+    if [[ "$RF_COLAB_MODE" != "true" ]] || [[ "$RF_TRACKING_BACKEND" != "tensorboard" ]]; then
+        if start_mlflow; then
+            ((services_started++))
+        else
+            print_error "Failed to start MLflow server"
+        fi
     else
-
+        print_status "⊗ Skipping MLflow (using TensorBoard-only tracking in Colab mode)"
     fi

-    # Start API server
+    # Start API server (always)
     if start_api_server; then
         ((services_started++))
     else
         print_error "Failed to start API server"
     fi

-    # Start frontend server
-    if
-
+    # Start frontend server (conditionally)
+    if [[ "$RF_COLAB_MODE" != "true" ]]; then
+        if start_frontend; then
+            ((services_started++))
+        else
+            print_error "Failed to start frontend server"
+        fi
     else
-
+        print_status "⊗ Skipping frontend (using TensorBoard in Colab mode)"
     fi

     return $((total_services - services_started))
rapidfireai/utils/constants.py
CHANGED
@@ -1,7 +1,26 @@
 from enum import Enum
+import os

 MLFLOW_URL = "http://127.0.0.1:5002"

+# Tracking Backend Configuration
+def get_tracking_backend() -> str:
+    """
+    Get the tracking backend from environment variable at runtime.
+
+    Returns:
+        str: The tracking backend ('mlflow', 'tensorboard', or 'both')
+
+    Note: This reads from os.environ at runtime to allow setting the env var
+    after module import (important for notebook environments like Colab).
+    """
+    backend = os.getenv("RF_TRACKING_BACKEND", "mlflow")
+    return backend
+
+# Backwards compatibility: Keep constant but it will be stale if env var changes after import
+TRACKING_BACKEND = get_tracking_backend()  # Options: 'mlflow', 'tensorboard', 'both'
+TENSORBOARD_LOG_DIR = os.getenv("RF_TENSORBOARD_LOG_DIR", None)  # Default set by experiment path
+
 # Shared Memory Constants
 SHM_WARN_THRESHOLD = 80
 SHM_MIN_FREE_SPACE = 1.0
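The note in the docstring is the reason both a function and a constant exist: TRACKING_BACKEND is frozen at import time, while get_tracking_backend() re-reads the environment on every call. A small illustration, assuming the module imports as rapidfireai.utils.constants:

import os
from rapidfireai.utils import constants

print(constants.TRACKING_BACKEND)            # value captured when the module was imported
os.environ["RF_TRACKING_BACKEND"] = "both"   # e.g. set later from a notebook cell
print(constants.get_tracking_backend())      # 'both': re-read from the environment at call time
print(constants.TRACKING_BACKEND)            # unchanged (stale), as the comment above warns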
@@ -24,7 +43,7 @@ class DispatcherConfig:
     """Class to manage the dispatcher configuration"""

     HOST: str = "127.0.0.1"
-    PORT: int =
+    PORT: int = 8081


 # Database Constants