rapidfireai-0.10.2rc5-py3-none-any.whl → rapidfireai-0.10.3rc1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of rapidfireai might be problematic. Click here for more details.
- rapidfireai/backend/controller.py +29 -16
- rapidfireai/backend/worker.py +14 -7
- rapidfireai/cli.py +28 -1
- rapidfireai/db/rf_db.py +1 -1
- rapidfireai/db/tables.sql +1 -1
- rapidfireai/dispatcher/dispatcher.py +3 -1
- rapidfireai/dispatcher/gunicorn.conf.py +1 -1
- rapidfireai/experiment.py +75 -7
- rapidfireai/frontend/build/asset-manifest.json +3 -3
- rapidfireai/frontend/build/index.html +1 -1
- rapidfireai/frontend/build/static/js/{main.1bf27639.js → main.e7d3b759.js} +3 -3
- rapidfireai/frontend/build/static/js/{main.1bf27639.js.map → main.e7d3b759.js.map} +1 -1
- rapidfireai/frontend/proxy_middleware.py +1 -1
- rapidfireai/ml/callbacks.py +78 -38
- rapidfireai/ml/trainer.py +6 -6
- rapidfireai/start.sh +117 -34
- rapidfireai/utils/constants.py +20 -1
- rapidfireai/utils/experiment_utils.py +87 -43
- rapidfireai/utils/interactive_controller.py +494 -0
- rapidfireai/utils/metric_logger.py +346 -0
- rapidfireai/utils/mlflow_manager.py +0 -2
- rapidfireai/utils/worker_manager.py +16 -6
- rapidfireai/version.py +2 -2
- {rapidfireai-0.10.2rc5.dist-info → rapidfireai-0.10.3rc1.dist-info}/METADATA +7 -4
- {rapidfireai-0.10.2rc5.dist-info → rapidfireai-0.10.3rc1.dist-info}/RECORD +31 -28
- tutorial_notebooks/rf-colab-tensorboard-tutorial.ipynb +314 -0
- rapidfireai/frontend/build/static/js/{main.1bf27639.js.LICENSE.txt → main.e7d3b759.js.LICENSE.txt} +0 -0
- {rapidfireai-0.10.2rc5.dist-info → rapidfireai-0.10.3rc1.dist-info}/WHEEL +0 -0
- {rapidfireai-0.10.2rc5.dist-info → rapidfireai-0.10.3rc1.dist-info}/entry_points.txt +0 -0
- {rapidfireai-0.10.2rc5.dist-info → rapidfireai-0.10.3rc1.dist-info}/licenses/LICENSE +0 -0
- {rapidfireai-0.10.2rc5.dist-info → rapidfireai-0.10.3rc1.dist-info}/top_level.txt +0 -0
|
@@ -8,7 +8,6 @@ import sys
|
|
|
8
8
|
import warnings
|
|
9
9
|
from typing import Any
|
|
10
10
|
|
|
11
|
-
import mlflow
|
|
12
11
|
import pandas as pd
|
|
13
12
|
import torch
|
|
14
13
|
from IPython.display import display
|
|
@@ -16,11 +15,13 @@ from tqdm import tqdm
|
|
|
16
15
|
from transformers import logging as transformers_logging
|
|
17
16
|
|
|
18
17
|
from rapidfireai.db.rf_db import RfDb
|
|
19
|
-
from rapidfireai.utils.constants import MLFLOW_URL, ExperimentStatus, ExperimentTask
|
|
18
|
+
from rapidfireai.utils.constants import MLFLOW_URL, ExperimentStatus, ExperimentTask, get_tracking_backend
|
|
20
19
|
from rapidfireai.utils.datapaths import DataPath
|
|
21
20
|
from rapidfireai.utils.exceptions import DBException, ExperimentException
|
|
22
21
|
from rapidfireai.utils.logging import RFLogger
|
|
23
|
-
|
|
22
|
+
|
|
23
|
+
# Note: mlflow and MLflowManager are imported lazily inside conditional blocks
|
|
24
|
+
# to avoid MLflow connection attempts when using tensorboard-only mode
|
|
24
25
|
|
|
25
26
|
|
|
26
27
|
class ExperimentUtils:
|
|
@@ -82,12 +83,16 @@ class ExperimentUtils:
|
|
|
82
83
|
self._disable_ml_warnings_display()
|
|
83
84
|
|
|
84
85
|
# Clear any existing MLflow context before starting new experiment
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
86
|
+
# Only if using MLflow backend
|
|
87
|
+
tracking_backend = get_tracking_backend()
|
|
88
|
+
if tracking_backend in ["mlflow", "both"]:
|
|
89
|
+
import mlflow # Lazy import to avoid connection attempts in tensorboard-only mode
|
|
90
|
+
try:
|
|
91
|
+
if mlflow.active_run():
|
|
92
|
+
print("Clearing existing MLflow context before starting new experiment")
|
|
93
|
+
mlflow.end_run()
|
|
94
|
+
except Exception as e:
|
|
95
|
+
print(f"Error clearing existing MLflow context: {e}")
|
|
91
96
|
|
|
92
97
|
# check if experiment is already running
|
|
93
98
|
running_experiment = None
|
|
@@ -124,11 +129,18 @@ class ExperimentUtils:
|
|
|
124
129
|
given_name,
|
|
125
130
|
experiments_path,
|
|
126
131
|
)
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
+
if mlflow_experiment_id:
|
|
133
|
+
msg = (
|
|
134
|
+
f"The previously running experiment {running_experiment['experiment_name']} was forcibly ended."
|
|
135
|
+
f" Created a new experiment '{experiment_name}' with Experiment ID: {experiment_id}"
|
|
136
|
+
f" and MLflow Experiment ID: {mlflow_experiment_id} at {experiments_path}/{experiment_name}"
|
|
137
|
+
)
|
|
138
|
+
else:
|
|
139
|
+
msg = (
|
|
140
|
+
f"The previously running experiment {running_experiment['experiment_name']} was forcibly ended."
|
|
141
|
+
f" Created a new experiment '{experiment_name}' with Experiment ID: {experiment_id}"
|
|
142
|
+
f" at {experiments_path}/{experiment_name} (TensorBoard-only mode)"
|
|
143
|
+
)
|
|
132
144
|
print(msg)
|
|
133
145
|
log_messages.append(msg)
|
|
134
146
|
# check if experiment name already exists
|
|
@@ -137,11 +149,18 @@ class ExperimentUtils:
|
|
|
137
149
|
given_name,
|
|
138
150
|
experiments_path,
|
|
139
151
|
)
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
152
|
+
if mlflow_experiment_id:
|
|
153
|
+
msg = (
|
|
154
|
+
"An experiment with the same name already exists."
|
|
155
|
+
f" Created a new experiment '{experiment_name}' with Experiment ID: {experiment_id}"
|
|
156
|
+
f" and MLflow Experiment ID: {mlflow_experiment_id} at {experiments_path}/{experiment_name}"
|
|
157
|
+
)
|
|
158
|
+
else:
|
|
159
|
+
msg = (
|
|
160
|
+
"An experiment with the same name already exists."
|
|
161
|
+
f" Created a new experiment '{experiment_name}' with Experiment ID: {experiment_id}"
|
|
162
|
+
f" at {experiments_path}/{experiment_name} (TensorBoard-only mode)"
|
|
163
|
+
)
|
|
145
164
|
print(msg)
|
|
146
165
|
log_messages.append(msg)
|
|
147
166
|
else:
|
|
@@ -149,10 +168,16 @@ class ExperimentUtils:
|
|
|
149
168
|
given_name,
|
|
150
169
|
experiments_path,
|
|
151
170
|
)
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
171
|
+
if mlflow_experiment_id:
|
|
172
|
+
msg = (
|
|
173
|
+
f"Experiment {experiment_name} created with Experiment ID: {experiment_id}"
|
|
174
|
+
f" and MLflow Experiment ID: {mlflow_experiment_id} at {experiments_path}/{experiment_name}"
|
|
175
|
+
)
|
|
176
|
+
else:
|
|
177
|
+
msg = (
|
|
178
|
+
f"Experiment {experiment_name} created with Experiment ID: {experiment_id}"
|
|
179
|
+
f" at {experiments_path}/{experiment_name} (TensorBoard-only mode)"
|
|
180
|
+
)
|
|
156
181
|
print(msg)
|
|
157
182
|
log_messages.append(msg)
|
|
158
183
|
|
|
@@ -185,20 +210,24 @@ class ExperimentUtils:
|
|
|
185
210
|
self.db.set_experiment_status(current_experiment["experiment_id"], ExperimentStatus.COMPLETED)
|
|
186
211
|
self.db.reset_all_tables()
|
|
187
212
|
|
|
188
|
-
# Clear MLflow context
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
# Also clear context through MLflowManager if available
|
|
213
|
+
# Clear MLflow context only if using MLflow backend
|
|
214
|
+
tracking_backend = get_tracking_backend()
|
|
215
|
+
if tracking_backend in ["mlflow", "both"]:
|
|
216
|
+
import mlflow # Lazy import to avoid connection attempts in tensorboard-only mode
|
|
217
|
+
from rapidfireai.utils.mlflow_manager import MLflowManager
|
|
195
218
|
try:
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
219
|
+
if mlflow.active_run():
|
|
220
|
+
print("Ending active MLflow run before ending experiment")
|
|
221
|
+
mlflow.end_run()
|
|
222
|
+
|
|
223
|
+
# Also clear context through MLflowManager if available
|
|
224
|
+
try:
|
|
225
|
+
mlflow_manager = MLflowManager(MLFLOW_URL)
|
|
226
|
+
mlflow_manager.clear_context()
|
|
227
|
+
except Exception as e2:
|
|
228
|
+
print(f"Error clearing MLflow context through MLflowManager: {e2}")
|
|
229
|
+
except Exception as e:
|
|
230
|
+
print(f"Error clearing MLflow context: {e}")
|
|
202
231
|
|
|
203
232
|
# print experiment ended message
|
|
204
233
|
msg = f"Experiment {experiment_name} ended"
|
|
@@ -311,28 +340,43 @@ class ExperimentUtils:
|
|
|
311
340
|
print(f"Error displaying runs info: {e}")
|
|
312
341
|
raise
|
|
313
342
|
|
|
314
|
-
def _create_experiment_internal(self, given_name: str, experiments_path: str) -> tuple[int, str, str]:
|
|
343
|
+
def _create_experiment_internal(self, given_name: str, experiments_path: str) -> tuple[int, str, str | None]:
|
|
315
344
|
"""Create new experiment -
|
|
316
345
|
if given_name already exists - increment suffix and create new experiment
|
|
317
346
|
if given_name is new - create new experiment with given name
|
|
347
|
+
Returns: experiment_id, experiment_name, mlflow_experiment_id (or None if tensorboard-only)
|
|
318
348
|
"""
|
|
319
349
|
try:
|
|
320
350
|
given_name = given_name if given_name else "rf-exp"
|
|
321
351
|
experiment_name = self._generate_unique_experiment_name(given_name, self.db.get_all_experiment_names())
|
|
322
352
|
|
|
323
|
-
|
|
324
|
-
mlflow_experiment_id =
|
|
325
|
-
|
|
353
|
+
# Create MLflow experiment only if using MLflow backend
|
|
354
|
+
mlflow_experiment_id = None
|
|
355
|
+
tracking_backend = get_tracking_backend()
|
|
356
|
+
if tracking_backend in ["mlflow", "both"]:
|
|
357
|
+
import mlflow # Lazy import to avoid connection attempts in tensorboard-only mode
|
|
358
|
+
from rapidfireai.utils.mlflow_manager import MLflowManager
|
|
359
|
+
try:
|
|
360
|
+
mlflow_manager = MLflowManager(MLFLOW_URL)
|
|
361
|
+
mlflow_experiment_id = mlflow_manager.create_experiment(experiment_name)
|
|
362
|
+
mlflow.tracing.disable_notebook_display()
|
|
363
|
+
except Exception as e:
|
|
364
|
+
# Catch MLflow-specific exceptions (mlflow.exceptions.RestException, etc.)
|
|
365
|
+
raise ExperimentException(f"Error creating MLFlow experiment: {e}") from e
|
|
326
366
|
|
|
327
367
|
# write new experiment details to database
|
|
328
368
|
experiment_id = self.db.create_experiment(
|
|
329
369
|
experiment_name,
|
|
330
|
-
mlflow_experiment_id,
|
|
370
|
+
mlflow_experiment_id, # Will be None for tensorboard-only
|
|
331
371
|
config_options={"experiments_path": experiments_path},
|
|
332
372
|
)
|
|
333
373
|
return experiment_id, experiment_name, mlflow_experiment_id
|
|
334
|
-
except
|
|
335
|
-
raise
|
|
374
|
+
except ExperimentException:
|
|
375
|
+
# Re-raise ExperimentExceptions (including MLflow errors from above)
|
|
376
|
+
raise
|
|
377
|
+
except Exception as e:
|
|
378
|
+
# Catch any other unexpected errors
|
|
379
|
+
raise ExperimentException(f"Error in _create_experiment_internal: {e}") from e
|
|
336
380
|
|
|
337
381
|
def _generate_unique_experiment_name(self, name: str, existing_names: list[str]) -> str:
|
|
338
382
|
"""Increment the suffix of the name after the last '_' till it is unique"""
|