mlrun 1.7.0rc6__py3-none-any.whl → 1.7.0rc8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mlrun might be problematic.
- mlrun/__main__.py +2 -0
- mlrun/common/constants.py +6 -0
- mlrun/common/schemas/__init__.py +3 -0
- mlrun/common/schemas/api_gateway.py +8 -1
- mlrun/common/schemas/model_monitoring/__init__.py +4 -0
- mlrun/common/schemas/model_monitoring/constants.py +35 -18
- mlrun/common/schemas/project.py +1 -0
- mlrun/common/types.py +7 -1
- mlrun/config.py +34 -10
- mlrun/data_types/data_types.py +4 -0
- mlrun/datastore/alibaba_oss.py +130 -0
- mlrun/datastore/azure_blob.py +4 -5
- mlrun/datastore/base.py +22 -16
- mlrun/datastore/datastore.py +4 -0
- mlrun/datastore/datastore_profile.py +7 -0
- mlrun/datastore/google_cloud_storage.py +1 -1
- mlrun/datastore/sources.py +2 -3
- mlrun/datastore/targets.py +6 -1
- mlrun/db/base.py +14 -6
- mlrun/db/httpdb.py +61 -56
- mlrun/db/nopdb.py +3 -0
- mlrun/frameworks/tf_keras/callbacks/logging_callback.py +6 -1
- mlrun/frameworks/tf_keras/mlrun_interface.py +20 -8
- mlrun/kfpops.py +2 -5
- mlrun/model.py +1 -0
- mlrun/model_monitoring/__init__.py +1 -1
- mlrun/model_monitoring/api.py +104 -295
- mlrun/model_monitoring/controller.py +25 -25
- mlrun/model_monitoring/db/__init__.py +16 -0
- mlrun/model_monitoring/{stores → db/stores}/__init__.py +43 -34
- mlrun/model_monitoring/db/stores/base/__init__.py +15 -0
- mlrun/model_monitoring/{stores/model_endpoint_store.py → db/stores/base/store.py} +47 -6
- mlrun/model_monitoring/db/stores/sqldb/__init__.py +13 -0
- mlrun/model_monitoring/db/stores/sqldb/models/__init__.py +49 -0
- mlrun/model_monitoring/{stores → db/stores/sqldb}/models/base.py +76 -3
- mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +68 -0
- mlrun/model_monitoring/{stores → db/stores/sqldb}/models/sqlite.py +13 -1
- mlrun/model_monitoring/db/stores/sqldb/sql_store.py +662 -0
- mlrun/model_monitoring/db/stores/v3io_kv/__init__.py +13 -0
- mlrun/model_monitoring/{stores/kv_model_endpoint_store.py → db/stores/v3io_kv/kv_store.py} +134 -3
- mlrun/model_monitoring/helpers.py +3 -3
- mlrun/model_monitoring/stream_processing.py +41 -9
- mlrun/model_monitoring/tracking_policy.py +7 -1
- mlrun/model_monitoring/writer.py +4 -36
- mlrun/projects/pipelines.py +14 -2
- mlrun/projects/project.py +118 -103
- mlrun/run.py +5 -1
- mlrun/runtimes/base.py +6 -0
- mlrun/runtimes/nuclio/api_gateway.py +218 -65
- mlrun/runtimes/nuclio/function.py +3 -0
- mlrun/runtimes/nuclio/serving.py +28 -32
- mlrun/runtimes/pod.py +26 -0
- mlrun/serving/routers.py +4 -3
- mlrun/serving/server.py +4 -6
- mlrun/serving/states.py +34 -14
- mlrun/serving/v2_serving.py +4 -3
- mlrun/utils/helpers.py +34 -0
- mlrun/utils/http.py +1 -1
- mlrun/utils/retryer.py +1 -0
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.7.0rc6.dist-info → mlrun-1.7.0rc8.dist-info}/METADATA +25 -16
- {mlrun-1.7.0rc6.dist-info → mlrun-1.7.0rc8.dist-info}/RECORD +66 -62
- mlrun/model_monitoring/batch.py +0 -933
- mlrun/model_monitoring/stores/models/__init__.py +0 -27
- mlrun/model_monitoring/stores/models/mysql.py +0 -34
- mlrun/model_monitoring/stores/sql_model_endpoint_store.py +0 -382
- {mlrun-1.7.0rc6.dist-info → mlrun-1.7.0rc8.dist-info}/LICENSE +0 -0
- {mlrun-1.7.0rc6.dist-info → mlrun-1.7.0rc8.dist-info}/WHEEL +0 -0
- {mlrun-1.7.0rc6.dist-info → mlrun-1.7.0rc8.dist-info}/entry_points.txt +0 -0
- {mlrun-1.7.0rc6.dist-info → mlrun-1.7.0rc8.dist-info}/top_level.txt +0 -0
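Taken together, the model-monitoring changes in this release (see the mlrun/model_monitoring/api.py diff below) remove the in-place batch drift job and drift-analysis helpers and deprecate the related arguments of record_results; monitoring is instead enabled on the project and runs on a periodic schedule. A minimal, hedged sketch of the new flow, based only on the calls visible in this diff; the project name, model URI and endpoint name are placeholders, and the model_endpoint_name argument is an assumption not shown in the hunks below:

import pandas as pd

import mlrun
from mlrun.model_monitoring.api import record_results

# Placeholder project; enable_model_monitoring() is the replacement the new docstring
# and deprecation warnings point to (instead of trigger_monitoring_job=True).
project = mlrun.get_or_create_project("my-project")
project.enable_model_monitoring()

# Placeholder inference results to record against the model endpoint.
infer_results_df = pd.DataFrame(
    {"feature_a": [0.1, 0.2], "feature_b": [1.3, 0.7], "prediction": [0, 1]}
)

model_endpoint = record_results(
    project=project.name,
    model_path="store://models/my-project/my-model:latest",  # placeholder model store URI
    model_endpoint_name="my-endpoint",  # assumed argument, not part of this diff
    infer_results_df=infer_results_df,
    # drift_threshold, possible_drift_threshold, trigger_monitoring_job, artifacts_tag and
    # default_batch_image are deprecated since 1.7.0 and now only emit a FutureWarning.
)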
mlrun/model_monitoring/api.py
CHANGED
@@ -13,8 +13,8 @@
 # limitations under the License.
 
 import hashlib
-import json
 import typing
+import warnings
 from datetime import datetime
 
 import numpy as np
@@ -22,13 +22,13 @@ import pandas as pd
 
 import mlrun.artifacts
 import mlrun.common.helpers
+import mlrun.common.schemas.model_monitoring.constants as mm_consts
 import mlrun.feature_store
-
+import mlrun.model_monitoring.application
+import mlrun.serving
 from mlrun.data_types.infer import InferOptions, get_df_stats
 from mlrun.utils import datetime_now, logger
 
-from .batch import VirtualDrift
-from .features_drift_table import FeaturesDriftTablePlot
 from .helpers import update_model_endpoint_last_request
 from .model_endpoint import ModelEndpoint
 
@@ -48,7 +48,7 @@ def get_or_create_model_endpoint(
     sample_set_statistics: dict[str, typing.Any] = None,
     drift_threshold: float = None,
     possible_drift_threshold: float = None,
-    monitoring_mode: ModelMonitoringMode = ModelMonitoringMode.disabled,
+    monitoring_mode: mm_consts.ModelMonitoringMode = mm_consts.ModelMonitoringMode.disabled,
     db_session=None,
 ) -> ModelEndpoint:
     """
@@ -128,20 +128,19 @@ def record_results(
     context: typing.Optional[mlrun.MLClientCtx] = None,
     infer_results_df: typing.Optional[pd.DataFrame] = None,
     sample_set_statistics: typing.Optional[dict[str, typing.Any]] = None,
-    monitoring_mode: ModelMonitoringMode = ModelMonitoringMode.enabled,
+    monitoring_mode: mm_consts.ModelMonitoringMode = mm_consts.ModelMonitoringMode.enabled,
+    # Deprecated arguments:
     drift_threshold: typing.Optional[float] = None,
     possible_drift_threshold: typing.Optional[float] = None,
     trigger_monitoring_job: bool = False,
     artifacts_tag: str = "",
-    default_batch_image="mlrun/mlrun",
+    default_batch_image: str = "mlrun/mlrun",
 ) -> ModelEndpoint:
     """
     Write a provided inference dataset to model endpoint parquet target. If not exist, generate a new model endpoint
     record and use the provided sample set statistics as feature stats that will be used later for the drift analysis.
-    To
-
-    input data (along with the outputs). The drift rule is the value per-feature mean of the TVD and Hellinger scores
-    according to the provided thresholds.
+    To activate model monitoring, run `project.enable_model_monitoring()`. The model monitoring applications will be
+    triggered with the recorded data according to a periodic schedule.
 
     :param project:                  Project name.
     :param model_path:               The model Store path.
@@ -160,17 +159,47 @@ def record_results(
                                      the current model endpoint.
     :param monitoring_mode:          If enabled, apply model monitoring features on the provided endpoint id. Enabled
                                      by default.
-    :param drift_threshold:          The threshold of which to mark drifts.
-    :param possible_drift_threshold: The threshold of which to mark possible drifts.
-    :param trigger_monitoring_job:   If true, run the batch drift job. If not exists, the monitoring
-                                     will be registered through MLRun API with the provided image.
-    :param artifacts_tag:            Tag to use for all the artifacts resulted from the function.
-                                     only if the monitoring batch job has been triggered.
-
-
+    :param drift_threshold:          (deprecated) The threshold of which to mark drifts.
+    :param possible_drift_threshold: (deprecated) The threshold of which to mark possible drifts.
+    :param trigger_monitoring_job:   (deprecated) If true, run the batch drift job. If not exists, the monitoring
+                                     batch function will be registered through MLRun API with the provided image.
+    :param artifacts_tag:            (deprecated) Tag to use for all the artifacts resulted from the function.
+                                     Will be relevant only if the monitoring batch job has been triggered.
+    :param default_batch_image:      (deprecated) The image that will be used when registering the model monitoring
+                                     batch job.
 
     :return: A ModelEndpoint object
     """
+
+    if drift_threshold is not None or possible_drift_threshold is not None:
+        warnings.warn(
+            "Custom drift threshold arguments are deprecated since version "
+            "1.7.0 and have no effect. They will be removed in version 1.9.0.\n"
+            "To enable the default histogram data drift application, run:\n"
+            "`project.enable_model_monitoring()`.",
+            FutureWarning,
+        )
+    if trigger_monitoring_job is not False:
+        warnings.warn(
+            "`trigger_monitoring_job` argument is deprecated since version "
+            "1.7.0 and has no effect. It will be removed in version 1.9.0.\n"
+            "To enable the default histogram data drift application, run:\n"
+            "`project.enable_model_monitoring()`.",
+            FutureWarning,
+        )
+    if artifacts_tag != "":
+        warnings.warn(
+            "`artifacts_tag` argument is deprecated since version "
+            "1.7.0 and has no effect. It will be removed in version 1.9.0.",
+            FutureWarning,
+        )
+    if default_batch_image != "mlrun/mlrun":
+        warnings.warn(
+            "`default_batch_image` argument is deprecated since version "
+            "1.7.0 and has no effect. It will be removed in version 1.9.0.",
+            FutureWarning,
+        )
+
     db = mlrun.get_run_db()
 
     model_endpoint = get_or_create_model_endpoint(
@@ -181,8 +210,6 @@ def record_results(
         function_name=function_name,
         context=context,
         sample_set_statistics=sample_set_statistics,
-        drift_threshold=drift_threshold,
-        possible_drift_threshold=possible_drift_threshold,
         monitoring_mode=monitoring_mode,
         db_session=db,
     )
@@ -206,33 +233,6 @@ def record_results(
         db=db,
     )
 
-    if trigger_monitoring_job:
-        # Run the monitoring batch drift job
-        trigger_drift_batch_job(
-            project=project,
-            default_batch_image=default_batch_image,
-            model_endpoints_ids=[model_endpoint.metadata.uid],
-            db_session=db,
-        )
-
-        # Getting drift thresholds if not provided
-        drift_threshold, possible_drift_threshold = get_drift_thresholds_if_not_none(
-            model_endpoint=model_endpoint,
-            drift_threshold=drift_threshold,
-            possible_drift_threshold=possible_drift_threshold,
-        )
-
-        perform_drift_analysis(
-            project=project,
-            context=context,
-            sample_set_statistics=model_endpoint.status.feature_stats,
-            drift_threshold=drift_threshold,
-            possible_drift_threshold=possible_drift_threshold,
-            artifacts_tag=artifacts_tag,
-            endpoint_id=model_endpoint.metadata.uid,
-            db_session=db,
-        )
-
     return model_endpoint
 
 
@@ -282,7 +282,7 @@ def _model_endpoint_validations(
     # drift and possible drift thresholds
     if drift_threshold:
         current_drift_threshold = model_endpoint.spec.monitor_configuration.get(
-            EventFieldType.DRIFT_DETECTED_THRESHOLD,
+            mm_consts.EventFieldType.DRIFT_DETECTED_THRESHOLD,
            mlrun.mlconf.model_endpoint_monitoring.drift_thresholds.default.drift_detected,
         )
         if current_drift_threshold != drift_threshold:
@@ -293,7 +293,7 @@ def _model_endpoint_validations(
 
     if possible_drift_threshold:
         current_possible_drift_threshold = model_endpoint.spec.monitor_configuration.get(
-            EventFieldType.POSSIBLE_DRIFT_THRESHOLD,
+            mm_consts.EventFieldType.POSSIBLE_DRIFT_THRESHOLD,
             mlrun.mlconf.model_endpoint_monitoring.drift_thresholds.default.possible_drift,
         )
         if current_possible_drift_threshold != possible_drift_threshold:
@@ -303,40 +303,6 @@ def _model_endpoint_validations(
             )
 
 
-def get_drift_thresholds_if_not_none(
-    model_endpoint: ModelEndpoint,
-    drift_threshold: float = None,
-    possible_drift_threshold: float = None,
-) -> tuple[float, float]:
-    """
-    Get drift and possible drift thresholds. If one of the thresholds is missing, will try to retrieve
-    it from the `ModelEndpoint` object. If not defined under the `ModelEndpoint` as well, will retrieve it from
-    the default mlrun configuration.
-
-    :param model_endpoint:           `ModelEndpoint` object.
-    :param drift_threshold:          The threshold of which to mark drifts.
-    :param possible_drift_threshold: The threshold of which to mark possible drifts.
-
-    :return: A Tuple of:
-             [0] drift threshold as a float
-             [1] possible drift threshold as a float
-    """
-    if not drift_threshold:
-        # Getting drift threshold value from either model endpoint or monitoring default configurations
-        drift_threshold = model_endpoint.spec.monitor_configuration.get(
-            EventFieldType.DRIFT_DETECTED_THRESHOLD,
-            mlrun.mlconf.model_endpoint_monitoring.drift_thresholds.default.drift_detected,
-        )
-    if not possible_drift_threshold:
-        # Getting possible drift threshold value from either model endpoint or monitoring default configurations
-        possible_drift_threshold = model_endpoint.spec.monitor_configuration.get(
-            EventFieldType.POSSIBLE_DRIFT_THRESHOLD,
-            mlrun.mlconf.model_endpoint_monitoring.drift_thresholds.default.possible_drift,
-        )
-
-    return drift_threshold, possible_drift_threshold
-
-
 def write_monitoring_df(
     endpoint_id: str,
     infer_results_df: pd.DataFrame,
@@ -366,14 +332,14 @@ def write_monitoring_df(
     )
 
     # Modify the DataFrame to the required structure that will be used later by the monitoring batch job
-    if EventFieldType.TIMESTAMP not in infer_results_df.columns:
+    if mm_consts.EventFieldType.TIMESTAMP not in infer_results_df.columns:
         # Initialize timestamp column with the current time
-        infer_results_df[EventFieldType.TIMESTAMP] = infer_datetime
+        infer_results_df[mm_consts.EventFieldType.TIMESTAMP] = infer_datetime
 
     # `endpoint_id` is the monitoring feature set entity and therefore it should be defined as the df index before
     # the ingest process
-    infer_results_df[EventFieldType.ENDPOINT_ID] = endpoint_id
-    infer_results_df.set_index(EventFieldType.ENDPOINT_ID, inplace=True)
+    infer_results_df[mm_consts.EventFieldType.ENDPOINT_ID] = endpoint_id
+    infer_results_df.set_index(mm_consts.EventFieldType.ENDPOINT_ID, inplace=True)
 
     monitoring_feature_set.ingest(source=infer_results_df, overwrite=False)
 
@@ -389,7 +355,7 @@ def _generate_model_endpoint(
     sample_set_statistics: dict[str, typing.Any],
     drift_threshold: float,
     possible_drift_threshold: float,
-    monitoring_mode: ModelMonitoringMode = ModelMonitoringMode.disabled,
+    monitoring_mode: mm_consts.ModelMonitoringMode = mm_consts.ModelMonitoringMode.disabled,
 ) -> ModelEndpoint:
     """
     Write a new model endpoint record.
@@ -428,11 +394,11 @@ def _generate_model_endpoint(
     model_endpoint.spec.model_class = "drift-analysis"
     if drift_threshold:
         model_endpoint.spec.monitor_configuration[
-            EventFieldType.DRIFT_DETECTED_THRESHOLD
+            mm_consts.EventFieldType.DRIFT_DETECTED_THRESHOLD
         ] = drift_threshold
     if possible_drift_threshold:
         model_endpoint.spec.monitor_configuration[
-            EventFieldType.POSSIBLE_DRIFT_THRESHOLD
+            mm_consts.EventFieldType.POSSIBLE_DRIFT_THRESHOLD
         ] = possible_drift_threshold
 
     model_endpoint.spec.monitoring_mode = monitoring_mode
@@ -449,71 +415,6 @@ def _generate_model_endpoint(
     return db_session.get_model_endpoint(project=project, endpoint_id=endpoint_id)
 
 
-def trigger_drift_batch_job(
-    project: str,
-    default_batch_image="mlrun/mlrun",
-    model_endpoints_ids: list[str] = None,
-    batch_intervals_dict: dict[str, float] = None,
-    db_session=None,
-):
-    """
-    Run model monitoring drift analysis job. If not exists, the monitoring batch function will be registered through
-    MLRun API with the provided image.
-
-    :param project:              Project name.
-    :param default_batch_image:  The image that will be used when registering the model monitoring batch job.
-    :param model_endpoints_ids:  List of model endpoints to include in the current run.
-    :param batch_intervals_dict: Batch interval range (days, hours, minutes). By default, the batch interval is
-                                 configured to run through the last hour.
-    :param db_session:           A runtime session that manages the current dialog with the database.
-
-    """
-    if not model_endpoints_ids:
-        raise mlrun.errors.MLRunNotFoundError(
-            "No model endpoints provided",
-        )
-    if not db_session:
-        db_session = mlrun.get_run_db()
-
-    # Register the monitoring batch job (do nothing if already exist) and get the job function as a dictionary
-    batch_function_dict: dict[str, typing.Any] = db_session.deploy_monitoring_batch_job(
-        project=project,
-        default_batch_image=default_batch_image,
-    )
-
-    # Prepare current run params
-    job_params = _generate_job_params(
-        model_endpoints_ids=model_endpoints_ids,
-        batch_intervals_dict=batch_intervals_dict,
-    )
-
-    # Generate runtime and trigger the job function
-    batch_function = mlrun.new_function(runtime=batch_function_dict)
-    batch_function.run(name="model-monitoring-batch", params=job_params, watch=True)
-
-
-def _generate_job_params(
-    model_endpoints_ids: list[str],
-    batch_intervals_dict: dict[str, float] = None,
-):
-    """
-    Generate the required params for the model monitoring batch job function.
-
-    :param model_endpoints_ids:  List of model endpoints to include in the current run.
-    :param batch_intervals_dict: Batch interval range (days, hours, minutes). By default, the batch interval is
-                                 configured to run through the last hour.
-
-    """
-    if not batch_intervals_dict:
-        # Generate default batch intervals dict
-        batch_intervals_dict = {"minutes": 0, "hours": 1, "days": 0}
-
-    return {
-        "model_endpoints": model_endpoints_ids,
-        "batch_intervals_dict": batch_intervals_dict,
-    }
-
-
 def get_sample_set_statistics(
     sample_set: DatasetType = None,
     model_artifact_feature_stats: dict = None,
@@ -659,145 +560,6 @@ def read_dataset_as_dataframe(
     return dataset, label_columns
 
 
-def perform_drift_analysis(
-    project: str,
-    endpoint_id: str,
-    context: mlrun.MLClientCtx,
-    sample_set_statistics: dict,
-    drift_threshold: float,
-    possible_drift_threshold: float,
-    artifacts_tag: str = "",
-    db_session=None,
-) -> None:
-    """
-    Calculate drift per feature and produce the drift table artifact for logging post prediction. Note that most of
-    the calculations were already made through the monitoring batch job.
-
-    :param project:                  Project name.
-    :param endpoint_id:              Model endpoint unique ID.
-    :param context:                  MLRun context. Will log the artifacts.
-    :param sample_set_statistics:    The statistics of the sample set logged along a model.
-    :param drift_threshold:          The threshold of which to mark drifts.
-    :param possible_drift_threshold: The threshold of which to mark possible drifts.
-    :param artifacts_tag:            Tag to use for all the artifacts resulted from the function.
-    :param db_session:               A runtime session that manages the current dialog with the database.
-
-    """
-    if not db_session:
-        db_session = mlrun.get_run_db()
-
-    model_endpoint = db_session.get_model_endpoint(
-        project=project, endpoint_id=endpoint_id
-    )
-
-    # Get the drift metrics results along with the feature statistics from the latest batch
-    metrics = model_endpoint.status.drift_measures
-    inputs_statistics = model_endpoint.status.current_stats
-
-    inputs_statistics.pop(EventFieldType.TIMESTAMP, None)
-
-    # Calculate drift for each feature
-    virtual_drift = VirtualDrift()
-    drift_results = virtual_drift.check_for_drift_per_feature(
-        metrics_results_dictionary=metrics,
-        possible_drift_threshold=possible_drift_threshold,
-        drift_detected_threshold=drift_threshold,
-    )
-
-    # Drift table artifact
-    plotly_artifact = FeaturesDriftTablePlot().produce(
-        sample_set_statistics=sample_set_statistics,
-        inputs_statistics=inputs_statistics,
-        metrics=metrics,
-        drift_results=drift_results,
-    )
-
-    # Prepare drift result per feature dictionary
-    metrics_per_feature = {
-        feature: _get_drift_result(
-            tvd=metric_dictionary["tvd"],
-            hellinger=metric_dictionary["hellinger"],
-            threshold=drift_threshold,
-        )[1]
-        for feature, metric_dictionary in metrics.items()
-        if isinstance(metric_dictionary, dict)
-    }
-
-    # Calculate the final analysis result as well
-    drift_status, drift_metric = _get_drift_result(
-        tvd=metrics["tvd_mean"],
-        hellinger=metrics["hellinger_mean"],
-        threshold=drift_threshold,
-    )
-    # Log the different artifacts
-    _log_drift_artifacts(
-        context=context,
-        plotly_artifact=plotly_artifact,
-        metrics_per_feature=metrics_per_feature,
-        drift_status=drift_status,
-        drift_metric=drift_metric,
-        artifacts_tag=artifacts_tag,
-    )
-
-
-def _log_drift_artifacts(
-    context: mlrun.MLClientCtx,
-    plotly_artifact: mlrun.artifacts.Artifact,
-    metrics_per_feature: dict[str, float],
-    drift_status: bool,
-    drift_metric: float,
-    artifacts_tag: str,
-):
-    """
-    Log the following artifacts/results:
-    1 - Drift table plot which includes a detailed drift analysis per feature
-    2 - Drift result per feature in a JSON format
-    3 - Results of the total drift analysis
-
-    :param context:             MLRun context. Will log the artifacts.
-    :param plotly_artifact:     The plotly artifact.
-    :param metrics_per_feature: Dictionary in which the key is a feature name and the value is the drift numerical
-                                result.
-    :param drift_status:        Boolean value that represents the final drift analysis result.
-    :param drift_metric:        The final drift numerical result.
-    :param artifacts_tag:       Tag to use for all the artifacts resulted from the function.
-    """
-    context.log_artifact(plotly_artifact, tag=artifacts_tag)
-    context.log_artifact(
-        mlrun.artifacts.Artifact(
-            body=json.dumps(metrics_per_feature),
-            format="json",
-            key="features_drift_results",
-        ),
-        tag=artifacts_tag,
-    )
-    context.log_results(
-        results={"drift_status": drift_status, "drift_metric": drift_metric}
-    )
-
-
-def _get_drift_result(
-    tvd: float,
-    hellinger: float,
-    threshold: float,
-) -> tuple[bool, float]:
-    """
-    Calculate the drift result by the following equation: (tvd + hellinger) / 2
-
-    :param tvd:       The feature's TVD value.
-    :param hellinger: The feature's Hellinger value.
-    :param threshold: The threshold from which the value is considered a drift.
-
-    :returns: A tuple of:
-              [0] = Boolean value as the drift status.
-              [1] = The result.
-    """
-    result = (tvd + hellinger) / 2
-    if result >= threshold:
-        return True, result
-    return False, result
-
-
 def log_result(
     context: mlrun.MLClientCtx,
     result_set_name: str,
@@ -820,3 +582,50 @@ def log_result(
         key="batch_id",
         value=batch_id,
     )
+
+
+def _create_model_monitoring_function_base(
+    *,
+    project: str,
+    func: typing.Union[str, None] = None,
+    application_class: typing.Union[
+        str, mlrun.model_monitoring.application.ModelMonitoringApplicationBase, None
+    ] = None,
+    name: typing.Optional[str] = None,
+    image: typing.Optional[str] = None,
+    tag: typing.Optional[str] = None,
+    requirements: typing.Union[str, list[str], None] = None,
+    requirements_file: str = "",
+    **application_kwargs,
+) -> mlrun.runtimes.ServingRuntime:
+    """
+    Note: this is an internal API only.
+    This function does not set the labels or mounts v3io.
+    """
+    if func is None:
+        func = ""
+    func_obj = typing.cast(
+        mlrun.runtimes.ServingRuntime,
+        mlrun.code_to_function(
+            filename=func,
+            name=name,
+            project=project,
+            tag=tag,
+            kind=mlrun.run.RuntimeKinds.serving,
+            image=image,
+            requirements=requirements,
+            requirements_file=requirements_file,
+        ),
+    )
+    graph = func_obj.set_topology(mlrun.serving.states.StepKinds.flow)
+    if isinstance(application_class, str):
+        first_step = graph.to(class_name=application_class, **application_kwargs)
+    else:
+        first_step = graph.to(class_name=application_class)
+    first_step.to(
+        class_name="mlrun.model_monitoring.application.PushToMonitoringWriter",
+        name="PushToMonitoringWriter",
+        project=project,
+        writer_application_name=mm_consts.MonitoringFunctionNames.WRITER,
+    ).respond()
+    return func_obj

mlrun/model_monitoring/controller.py
CHANGED

@@ -21,12 +21,12 @@ from collections.abc import Iterator
 from typing import Any, NamedTuple, Optional, Union, cast
 
 import nuclio
-from v3io.dataplane.response import HttpResponseError
 
 import mlrun
 import mlrun.common.schemas.model_monitoring.constants as mm_constants
 import mlrun.data_types.infer
 import mlrun.feature_store as fstore
+import mlrun.model_monitoring.db.stores
 from mlrun.common.model_monitoring.helpers import FeatureStats, pad_features_hist
 from mlrun.datastore import get_stream_pusher
 from mlrun.datastore.targets import ParquetTarget
@@ -38,8 +38,7 @@ from mlrun.model_monitoring.helpers import (
     get_monitoring_parquet_path,
     get_stream_path,
 )
-from mlrun.utils import
-from mlrun.utils.v3io_clients import get_v3io_client
+from mlrun.utils import datetime_now, logger
 
 
 class _Interval(NamedTuple):
@@ -48,8 +47,6 @@ class _Interval(NamedTuple):
 
 
 class _BatchWindow:
-    V3IO_CONTAINER_FORMAT = "users/pipelines/{project}/monitoring-schedules/functions"
-
     def __init__(
         self,
         project: str,
@@ -65,27 +62,22 @@ class _BatchWindow:
         All the time values are in seconds.
         The start and stop time are in seconds since the epoch.
         """
+        self.project = project
         self._endpoint = endpoint
         self._application = application
         self._first_request = first_request
-        self._kv_storage = get_v3io_client(
-            endpoint=mlrun.mlconf.v3io_api,
-            # Avoid noisy warning logs before the KV table is created
-            logger=create_logger(name="v3io_client", level="error"),
-        ).kv
-        self._v3io_container = self.V3IO_CONTAINER_FORMAT.format(project=project)
         self._stop = last_updated
         self._step = timedelta_seconds
+        self._db = mlrun.model_monitoring.get_store_object(project=self.project)
         self._start = self._get_last_analyzed()
 
     def _get_last_analyzed(self) -> Optional[int]:
         try:
-
-
-
-                key=self._application,
+            last_analyzed = self._db.get_last_analyzed(
+                endpoint_id=self._endpoint,
+                application_name=self._application,
             )
-        except
+        except mlrun.errors.MLRunNotFoundError:
             logger.info(
                 "No last analyzed time was found for this endpoint and "
                 "application, as this is probably the first time this "
@@ -96,7 +88,7 @@ class _BatchWindow:
                 first_request=self._first_request,
                 last_updated=self._stop,
             )
-
+
             if self._first_request and self._stop:
                 # TODO : Change the timedelta according to the policy.
                 first_period_in_seconds = max(
@@ -108,7 +100,6 @@ class _BatchWindow:
                 )
                 return self._first_request
 
-        last_analyzed = data.output.item[mm_constants.SchedulingKeys.LAST_ANALYZED]
         logger.info(
             "Got the last analyzed time for this endpoint and application",
             endpoint=self._endpoint,
@@ -124,11 +115,11 @@ class _BatchWindow:
             application=self._application,
            last_analyzed=last_analyzed,
         )
-
-
-
-
-
+
+        self._db.update_last_analyzed(
+            endpoint_id=self._endpoint,
+            application_name=self._application,
+            last_analyzed=last_analyzed,
         )
 
     def get_intervals(
@@ -301,7 +292,7 @@ class MonitoringApplicationController:
            f"Initializing {self.__class__.__name__}", project=project
         )
 
-        self.db = mlrun.model_monitoring.
+        self.db = mlrun.model_monitoring.get_store_object(project=project)
 
         self._batch_window_generator = _BatchWindowGenerator(
            batch_dict=json.loads(
@@ -359,7 +350,12 @@ class MonitoringApplicationController:
                 {
                     app.metadata.name
                     for app in monitoring_functions
-                    if
+                    if (
+                        app.status.state == "ready"
+                        # workaround for the default app, as its `status.state` is `None`
+                        or app.metadata.name
+                        == mm_constants.MLRUN_HISTOGRAM_DATA_DRIFT_APP_NAME
+                    )
                 }
             )
             if not applications_names:
@@ -367,6 +363,10 @@ class MonitoringApplicationController:
                     "No monitoring functions found", project=self.project
                 )
                 return
+            self.context.logger.info(
+                "Starting to iterate over the applications",
+                applications=applications_names,
+            )
 
         except Exception as e:
            self.context.logger.error(

mlrun/model_monitoring/db/__init__.py
ADDED

@@ -0,0 +1,16 @@
+# Copyright 2024 Iguazio
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .stores import ObjectStoreFactory, get_store_object
+from .stores.base import StoreBase