rapidfireai 0.10.2rc4__py3-none-any.whl → 0.10.3rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of rapidfireai might be problematic. Click here for more details.

Files changed (37)
  1. rapidfireai/backend/controller.py +29 -16
  2. rapidfireai/backend/worker.py +14 -7
  3. rapidfireai/cli.py +28 -1
  4. rapidfireai/db/rf_db.py +1 -1
  5. rapidfireai/db/tables.sql +1 -1
  6. rapidfireai/dispatcher/dispatcher.py +3 -1
  7. rapidfireai/dispatcher/gunicorn.conf.py +1 -1
  8. rapidfireai/experiment.py +75 -7
  9. rapidfireai/frontend/build/asset-manifest.json +3 -3
  10. rapidfireai/frontend/build/index.html +1 -1
  11. rapidfireai/frontend/build/static/js/{main.3ff1e37d.js → main.e7d3b759.js} +3 -3
  12. rapidfireai/frontend/build/static/js/{main.3ff1e37d.js.map → main.e7d3b759.js.map} +1 -1
  13. rapidfireai/frontend/proxy_middleware.py +1 -1
  14. rapidfireai/ml/callbacks.py +78 -38
  15. rapidfireai/ml/trainer.py +6 -6
  16. rapidfireai/start.sh +117 -34
  17. rapidfireai/utils/constants.py +20 -1
  18. rapidfireai/utils/experiment_utils.py +87 -43
  19. rapidfireai/utils/interactive_controller.py +494 -0
  20. rapidfireai/utils/metric_logger.py +346 -0
  21. rapidfireai/utils/mlflow_manager.py +0 -2
  22. rapidfireai/utils/worker_manager.py +16 -6
  23. rapidfireai/version.py +2 -2
  24. {rapidfireai-0.10.2rc4.dist-info → rapidfireai-0.10.3rc1.dist-info}/METADATA +7 -4
  25. {rapidfireai-0.10.2rc4.dist-info → rapidfireai-0.10.3rc1.dist-info}/RECORD +37 -34
  26. tutorial_notebooks/rf-colab-tensorboard-tutorial.ipynb +314 -0
  27. tutorial_notebooks/rf-tutorial-dpo-alignment-lite.ipynb +6 -6
  28. tutorial_notebooks/rf-tutorial-dpo-alignment.ipynb +6 -6
  29. tutorial_notebooks/rf-tutorial-grpo-mathreasoning-lite.ipynb +6 -6
  30. tutorial_notebooks/rf-tutorial-grpo-mathreasoning.ipynb +6 -6
  31. tutorial_notebooks/rf-tutorial-sft-chatqa-lite.ipynb +6 -6
  32. tutorial_notebooks/rf-tutorial-sft-chatqa.ipynb +6 -6
  33. rapidfireai/frontend/build/static/js/{main.3ff1e37d.js.LICENSE.txt → main.e7d3b759.js.LICENSE.txt} +0 -0
  34. {rapidfireai-0.10.2rc4.dist-info → rapidfireai-0.10.3rc1.dist-info}/WHEEL +0 -0
  35. {rapidfireai-0.10.2rc4.dist-info → rapidfireai-0.10.3rc1.dist-info}/entry_points.txt +0 -0
  36. {rapidfireai-0.10.2rc4.dist-info → rapidfireai-0.10.3rc1.dist-info}/licenses/LICENSE +0 -0
  37. {rapidfireai-0.10.2rc4.dist-info → rapidfireai-0.10.3rc1.dist-info}/top_level.txt +0 -0
@@ -8,7 +8,6 @@ import sys
8
8
  import warnings
9
9
  from typing import Any
10
10
 
11
- import mlflow
12
11
  import pandas as pd
13
12
  import torch
14
13
  from IPython.display import display
@@ -16,11 +15,13 @@ from tqdm import tqdm
16
15
  from transformers import logging as transformers_logging
17
16
 
18
17
  from rapidfireai.db.rf_db import RfDb
19
- from rapidfireai.utils.constants import MLFLOW_URL, ExperimentStatus, ExperimentTask
18
+ from rapidfireai.utils.constants import MLFLOW_URL, ExperimentStatus, ExperimentTask, get_tracking_backend
20
19
  from rapidfireai.utils.datapaths import DataPath
21
20
  from rapidfireai.utils.exceptions import DBException, ExperimentException
22
21
  from rapidfireai.utils.logging import RFLogger
23
- from rapidfireai.utils.mlflow_manager import MLflowManager
22
+
23
+ # Note: mlflow and MLflowManager are imported lazily inside conditional blocks
24
+ # to avoid MLflow connection attempts when using tensorboard-only mode
24
25
 
25
26
 
26
27
  class ExperimentUtils:
@@ -82,12 +83,16 @@ class ExperimentUtils:
82
83
  self._disable_ml_warnings_display()
83
84
 
84
85
  # Clear any existing MLflow context before starting new experiment
85
- try:
86
- if mlflow.active_run():
87
- print("Clearing existing MLflow context before starting new experiment")
88
- mlflow.end_run()
89
- except Exception as e:
90
- print(f"Error clearing existing MLflow context: {e}")
86
+ # Only if using MLflow backend
87
+ tracking_backend = get_tracking_backend()
88
+ if tracking_backend in ["mlflow", "both"]:
89
+ import mlflow # Lazy import to avoid connection attempts in tensorboard-only mode
90
+ try:
91
+ if mlflow.active_run():
92
+ print("Clearing existing MLflow context before starting new experiment")
93
+ mlflow.end_run()
94
+ except Exception as e:
95
+ print(f"Error clearing existing MLflow context: {e}")
91
96
 
92
97
  # check if experiment is already running
93
98
  running_experiment = None
@@ -124,11 +129,18 @@ class ExperimentUtils:
124
129
  given_name,
125
130
  experiments_path,
126
131
  )
127
- msg = (
128
- f"The previously running experiment {running_experiment['experiment_name']} was forcibly ended."
129
- f" Created a new experiment with name '{experiment_name}' with Experiment ID: {experiment_id}"
130
- f" and MLFlow Experiment ID: {mlflow_experiment_id} saved at {experiments_path}/{experiment_name}"
131
- )
132
+ if mlflow_experiment_id:
133
+ msg = (
134
+ f"The previously running experiment {running_experiment['experiment_name']} was forcibly ended."
135
+ f" Created a new experiment '{experiment_name}' with Experiment ID: {experiment_id}"
136
+ f" and MLflow Experiment ID: {mlflow_experiment_id} at {experiments_path}/{experiment_name}"
137
+ )
138
+ else:
139
+ msg = (
140
+ f"The previously running experiment {running_experiment['experiment_name']} was forcibly ended."
141
+ f" Created a new experiment '{experiment_name}' with Experiment ID: {experiment_id}"
142
+ f" at {experiments_path}/{experiment_name} (TensorBoard-only mode)"
143
+ )
132
144
  print(msg)
133
145
  log_messages.append(msg)
134
146
  # check if experiment name already exists
@@ -137,11 +149,18 @@ class ExperimentUtils:
137
149
  given_name,
138
150
  experiments_path,
139
151
  )
140
- msg = (
141
- "An experiment with the same name already exists."
142
- f" Created a new experiment with name '{experiment_name}' with Experiment ID: {experiment_id}"
143
- f" and MLFlow Experiment ID: {mlflow_experiment_id} saved at {experiments_path}/{experiment_name}"
144
- )
152
+ if mlflow_experiment_id:
153
+ msg = (
154
+ "An experiment with the same name already exists."
155
+ f" Created a new experiment '{experiment_name}' with Experiment ID: {experiment_id}"
156
+ f" and MLflow Experiment ID: {mlflow_experiment_id} at {experiments_path}/{experiment_name}"
157
+ )
158
+ else:
159
+ msg = (
160
+ "An experiment with the same name already exists."
161
+ f" Created a new experiment '{experiment_name}' with Experiment ID: {experiment_id}"
162
+ f" at {experiments_path}/{experiment_name} (TensorBoard-only mode)"
163
+ )
145
164
  print(msg)
146
165
  log_messages.append(msg)
147
166
  else:
@@ -149,10 +168,16 @@ class ExperimentUtils:
149
168
  given_name,
150
169
  experiments_path,
151
170
  )
152
- msg = (
153
- f"Experiment {experiment_name} created with Experiment ID: {experiment_id}"
154
- f" and MLFlow Experiment ID: {mlflow_experiment_id} saved at {experiments_path}/{experiment_name}"
155
- )
171
+ if mlflow_experiment_id:
172
+ msg = (
173
+ f"Experiment {experiment_name} created with Experiment ID: {experiment_id}"
174
+ f" and MLflow Experiment ID: {mlflow_experiment_id} at {experiments_path}/{experiment_name}"
175
+ )
176
+ else:
177
+ msg = (
178
+ f"Experiment {experiment_name} created with Experiment ID: {experiment_id}"
179
+ f" at {experiments_path}/{experiment_name} (TensorBoard-only mode)"
180
+ )
156
181
  print(msg)
157
182
  log_messages.append(msg)
158
183
 
@@ -185,20 +210,24 @@ class ExperimentUtils:
185
210
  self.db.set_experiment_status(current_experiment["experiment_id"], ExperimentStatus.COMPLETED)
186
211
  self.db.reset_all_tables()
187
212
 
188
- # Clear MLflow context
189
- try:
190
- if mlflow.active_run():
191
- print("Ending active MLflow run before ending experiment")
192
- mlflow.end_run()
193
-
194
- # Also clear context through MLflowManager if available
213
+ # Clear MLflow context only if using MLflow backend
214
+ tracking_backend = get_tracking_backend()
215
+ if tracking_backend in ["mlflow", "both"]:
216
+ import mlflow # Lazy import to avoid connection attempts in tensorboard-only mode
217
+ from rapidfireai.utils.mlflow_manager import MLflowManager
195
218
  try:
196
- mlflow_manager = MLflowManager(MLFLOW_URL)
197
- mlflow_manager.clear_context()
198
- except Exception as e2:
199
- print(f"[Error clearing MLflow context through MLflowManager: {e2}")
200
- except Exception as e:
201
- print(f"Error clearing MLflow context: {e}")
219
+ if mlflow.active_run():
220
+ print("Ending active MLflow run before ending experiment")
221
+ mlflow.end_run()
222
+
223
+ # Also clear context through MLflowManager if available
224
+ try:
225
+ mlflow_manager = MLflowManager(MLFLOW_URL)
226
+ mlflow_manager.clear_context()
227
+ except Exception as e2:
228
+ print(f"Error clearing MLflow context through MLflowManager: {e2}")
229
+ except Exception as e:
230
+ print(f"Error clearing MLflow context: {e}")
202
231
 
203
232
  # print experiment ended message
204
233
  msg = f"Experiment {experiment_name} ended"
@@ -311,28 +340,43 @@ class ExperimentUtils:
311
340
  print(f"Error displaying runs info: {e}")
312
341
  raise
313
342
 
314
- def _create_experiment_internal(self, given_name: str, experiments_path: str) -> tuple[int, str, str]:
343
+ def _create_experiment_internal(self, given_name: str, experiments_path: str) -> tuple[int, str, str | None]:
315
344
  """Create new experiment -
316
345
  if given_name already exists - increment suffix and create new experiment
317
346
  if given_name is new - create new experiment with given name
347
+ Returns: experiment_id, experiment_name, mlflow_experiment_id (or None if tensorboard-only)
318
348
  """
319
349
  try:
320
350
  given_name = given_name if given_name else "rf-exp"
321
351
  experiment_name = self._generate_unique_experiment_name(given_name, self.db.get_all_experiment_names())
322
352
 
323
- mlflow_manager = MLflowManager(MLFLOW_URL)
324
- mlflow_experiment_id = mlflow_manager.create_experiment(experiment_name)
325
- mlflow.tracing.disable_notebook_display()
353
+ # Create MLflow experiment only if using MLflow backend
354
+ mlflow_experiment_id = None
355
+ tracking_backend = get_tracking_backend()
356
+ if tracking_backend in ["mlflow", "both"]:
357
+ import mlflow # Lazy import to avoid connection attempts in tensorboard-only mode
358
+ from rapidfireai.utils.mlflow_manager import MLflowManager
359
+ try:
360
+ mlflow_manager = MLflowManager(MLFLOW_URL)
361
+ mlflow_experiment_id = mlflow_manager.create_experiment(experiment_name)
362
+ mlflow.tracing.disable_notebook_display()
363
+ except Exception as e:
364
+ # Catch MLflow-specific exceptions (mlflow.exceptions.RestException, etc.)
365
+ raise ExperimentException(f"Error creating MLFlow experiment: {e}") from e
326
366
 
327
367
  # write new experiment details to database
328
368
  experiment_id = self.db.create_experiment(
329
369
  experiment_name,
330
- mlflow_experiment_id,
370
+ mlflow_experiment_id, # Will be None for tensorboard-only
331
371
  config_options={"experiments_path": experiments_path},
332
372
  )
333
373
  return experiment_id, experiment_name, mlflow_experiment_id
334
- except mlflow.exceptions.RestException as e: # pyright: ignore
335
- raise ExperimentException(f"Error creating MLFlow experiment: {e}") from e
374
+ except ExperimentException:
375
+ # Re-raise ExperimentExceptions (including MLflow errors from above)
376
+ raise
377
+ except Exception as e:
378
+ # Catch any other unexpected errors
379
+ raise ExperimentException(f"Error in _create_experiment_internal: {e}") from e
336
380
 
337
381
  def _generate_unique_experiment_name(self, name: str, existing_names: list[str]) -> str:
338
382
  """Increment the suffix of the name after the last '_' till it is unique"""