mlrun 1.10.0rc18__py3-none-any.whl → 1.11.0rc16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of mlrun might be problematic.
- mlrun/__init__.py +24 -3
- mlrun/__main__.py +0 -4
- mlrun/artifacts/dataset.py +2 -2
- mlrun/artifacts/document.py +6 -1
- mlrun/artifacts/llm_prompt.py +21 -15
- mlrun/artifacts/model.py +3 -3
- mlrun/artifacts/plots.py +1 -1
- mlrun/{model_monitoring/db/tsdb/tdengine → auth}/__init__.py +2 -3
- mlrun/auth/nuclio.py +89 -0
- mlrun/auth/providers.py +429 -0
- mlrun/auth/utils.py +415 -0
- mlrun/common/constants.py +14 -0
- mlrun/common/model_monitoring/helpers.py +123 -0
- mlrun/common/runtimes/constants.py +28 -0
- mlrun/common/schemas/__init__.py +14 -3
- mlrun/common/schemas/alert.py +2 -2
- mlrun/common/schemas/api_gateway.py +3 -0
- mlrun/common/schemas/auth.py +12 -10
- mlrun/common/schemas/client_spec.py +4 -0
- mlrun/common/schemas/constants.py +25 -0
- mlrun/common/schemas/frontend_spec.py +1 -8
- mlrun/common/schemas/function.py +34 -0
- mlrun/common/schemas/hub.py +33 -20
- mlrun/common/schemas/model_monitoring/__init__.py +2 -1
- mlrun/common/schemas/model_monitoring/constants.py +12 -15
- mlrun/common/schemas/model_monitoring/functions.py +13 -4
- mlrun/common/schemas/model_monitoring/model_endpoints.py +11 -0
- mlrun/common/schemas/pipeline.py +1 -1
- mlrun/common/schemas/secret.py +17 -2
- mlrun/common/secrets.py +95 -1
- mlrun/common/types.py +10 -10
- mlrun/config.py +69 -19
- mlrun/data_types/infer.py +2 -2
- mlrun/datastore/__init__.py +12 -5
- mlrun/datastore/azure_blob.py +162 -47
- mlrun/datastore/base.py +274 -10
- mlrun/datastore/datastore.py +7 -2
- mlrun/datastore/datastore_profile.py +84 -22
- mlrun/datastore/model_provider/huggingface_provider.py +225 -41
- mlrun/datastore/model_provider/mock_model_provider.py +87 -0
- mlrun/datastore/model_provider/model_provider.py +206 -74
- mlrun/datastore/model_provider/openai_provider.py +226 -66
- mlrun/datastore/s3.py +39 -18
- mlrun/datastore/sources.py +1 -1
- mlrun/datastore/store_resources.py +4 -4
- mlrun/datastore/storeytargets.py +17 -12
- mlrun/datastore/targets.py +1 -1
- mlrun/datastore/utils.py +25 -6
- mlrun/datastore/v3io.py +1 -1
- mlrun/db/base.py +63 -32
- mlrun/db/httpdb.py +373 -153
- mlrun/db/nopdb.py +54 -21
- mlrun/errors.py +4 -2
- mlrun/execution.py +66 -25
- mlrun/feature_store/api.py +1 -1
- mlrun/feature_store/common.py +1 -1
- mlrun/feature_store/feature_vector_utils.py +1 -1
- mlrun/feature_store/steps.py +8 -6
- mlrun/frameworks/_common/utils.py +3 -3
- mlrun/frameworks/_dl_common/loggers/logger.py +1 -1
- mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +2 -1
- mlrun/frameworks/_ml_common/loggers/mlrun_logger.py +1 -1
- mlrun/frameworks/_ml_common/utils.py +2 -1
- mlrun/frameworks/auto_mlrun/auto_mlrun.py +4 -3
- mlrun/frameworks/lgbm/mlrun_interfaces/mlrun_interface.py +2 -1
- mlrun/frameworks/onnx/dataset.py +2 -1
- mlrun/frameworks/onnx/mlrun_interface.py +2 -1
- mlrun/frameworks/pytorch/callbacks/logging_callback.py +5 -4
- mlrun/frameworks/pytorch/callbacks/mlrun_logging_callback.py +2 -1
- mlrun/frameworks/pytorch/callbacks/tensorboard_logging_callback.py +2 -1
- mlrun/frameworks/pytorch/utils.py +2 -1
- mlrun/frameworks/sklearn/metric.py +2 -1
- mlrun/frameworks/tf_keras/callbacks/logging_callback.py +5 -4
- mlrun/frameworks/tf_keras/callbacks/mlrun_logging_callback.py +2 -1
- mlrun/frameworks/tf_keras/callbacks/tensorboard_logging_callback.py +2 -1
- mlrun/hub/__init__.py +52 -0
- mlrun/hub/base.py +142 -0
- mlrun/hub/module.py +172 -0
- mlrun/hub/step.py +113 -0
- mlrun/k8s_utils.py +105 -16
- mlrun/launcher/base.py +15 -7
- mlrun/launcher/local.py +4 -1
- mlrun/model.py +14 -4
- mlrun/model_monitoring/__init__.py +0 -1
- mlrun/model_monitoring/api.py +65 -28
- mlrun/model_monitoring/applications/__init__.py +1 -1
- mlrun/model_monitoring/applications/base.py +299 -128
- mlrun/model_monitoring/applications/context.py +2 -4
- mlrun/model_monitoring/controller.py +132 -58
- mlrun/model_monitoring/db/_schedules.py +38 -29
- mlrun/model_monitoring/db/_stats.py +6 -16
- mlrun/model_monitoring/db/tsdb/__init__.py +9 -7
- mlrun/model_monitoring/db/tsdb/base.py +29 -9
- mlrun/model_monitoring/db/tsdb/preaggregate.py +234 -0
- mlrun/model_monitoring/db/tsdb/stream_graph_steps.py +63 -0
- mlrun/model_monitoring/db/tsdb/timescaledb/queries/timescaledb_metrics_queries.py +414 -0
- mlrun/model_monitoring/db/tsdb/timescaledb/queries/timescaledb_predictions_queries.py +376 -0
- mlrun/model_monitoring/db/tsdb/timescaledb/queries/timescaledb_results_queries.py +590 -0
- mlrun/model_monitoring/db/tsdb/timescaledb/timescaledb_connection.py +434 -0
- mlrun/model_monitoring/db/tsdb/timescaledb/timescaledb_connector.py +541 -0
- mlrun/model_monitoring/db/tsdb/timescaledb/timescaledb_operations.py +808 -0
- mlrun/model_monitoring/db/tsdb/timescaledb/timescaledb_schema.py +502 -0
- mlrun/model_monitoring/db/tsdb/timescaledb/timescaledb_stream.py +163 -0
- mlrun/model_monitoring/db/tsdb/timescaledb/timescaledb_stream_graph_steps.py +60 -0
- mlrun/model_monitoring/db/tsdb/timescaledb/utils/timescaledb_dataframe_processor.py +141 -0
- mlrun/model_monitoring/db/tsdb/timescaledb/utils/timescaledb_query_builder.py +585 -0
- mlrun/model_monitoring/db/tsdb/timescaledb/writer_graph_steps.py +73 -0
- mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +20 -9
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +235 -51
- mlrun/model_monitoring/features_drift_table.py +2 -1
- mlrun/model_monitoring/helpers.py +30 -6
- mlrun/model_monitoring/stream_processing.py +34 -28
- mlrun/model_monitoring/writer.py +224 -4
- mlrun/package/__init__.py +2 -1
- mlrun/platforms/__init__.py +0 -43
- mlrun/platforms/iguazio.py +8 -4
- mlrun/projects/operations.py +17 -11
- mlrun/projects/pipelines.py +2 -2
- mlrun/projects/project.py +187 -123
- mlrun/run.py +95 -21
- mlrun/runtimes/__init__.py +2 -186
- mlrun/runtimes/base.py +103 -25
- mlrun/runtimes/constants.py +225 -0
- mlrun/runtimes/daskjob.py +5 -2
- mlrun/runtimes/databricks_job/databricks_runtime.py +2 -1
- mlrun/runtimes/local.py +5 -2
- mlrun/runtimes/mounts.py +20 -2
- mlrun/runtimes/nuclio/__init__.py +12 -7
- mlrun/runtimes/nuclio/api_gateway.py +36 -6
- mlrun/runtimes/nuclio/application/application.py +339 -40
- mlrun/runtimes/nuclio/function.py +222 -72
- mlrun/runtimes/nuclio/serving.py +132 -42
- mlrun/runtimes/pod.py +213 -21
- mlrun/runtimes/utils.py +49 -9
- mlrun/secrets.py +99 -14
- mlrun/serving/__init__.py +2 -0
- mlrun/serving/remote.py +84 -11
- mlrun/serving/routers.py +26 -44
- mlrun/serving/server.py +138 -51
- mlrun/serving/serving_wrapper.py +6 -2
- mlrun/serving/states.py +997 -283
- mlrun/serving/steps.py +62 -0
- mlrun/serving/system_steps.py +149 -95
- mlrun/serving/v2_serving.py +9 -10
- mlrun/track/trackers/mlflow_tracker.py +29 -31
- mlrun/utils/helpers.py +292 -94
- mlrun/utils/http.py +9 -2
- mlrun/utils/notifications/notification/base.py +18 -0
- mlrun/utils/notifications/notification/git.py +3 -5
- mlrun/utils/notifications/notification/mail.py +39 -16
- mlrun/utils/notifications/notification/slack.py +2 -4
- mlrun/utils/notifications/notification/webhook.py +2 -5
- mlrun/utils/notifications/notification_pusher.py +3 -3
- mlrun/utils/version/version.json +2 -2
- mlrun/utils/version/version.py +3 -4
- {mlrun-1.10.0rc18.dist-info → mlrun-1.11.0rc16.dist-info}/METADATA +63 -74
- {mlrun-1.10.0rc18.dist-info → mlrun-1.11.0rc16.dist-info}/RECORD +161 -143
- mlrun/api/schemas/__init__.py +0 -259
- mlrun/db/auth_utils.py +0 -152
- mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +0 -344
- mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +0 -75
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connection.py +0 -281
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +0 -1266
- {mlrun-1.10.0rc18.dist-info → mlrun-1.11.0rc16.dist-info}/WHEEL +0 -0
- {mlrun-1.10.0rc18.dist-info → mlrun-1.11.0rc16.dist-info}/entry_points.txt +0 -0
- {mlrun-1.10.0rc18.dist-info → mlrun-1.11.0rc16.dist-info}/licenses/LICENSE +0 -0
- {mlrun-1.10.0rc18.dist-info → mlrun-1.11.0rc16.dist-info}/top_level.txt +0 -0
mlrun/model_monitoring/applications/base.py
@@ -18,7 +18,7 @@ from abc import ABC, abstractmethod
 from collections import defaultdict
 from collections.abc import Iterator
 from contextlib import contextmanager, nullcontext
-from datetime import datetime, timedelta
+from datetime import UTC, datetime, timedelta
 from typing import Any, Literal, Optional, Union, cast
 
 import pandas as pd
@@ -27,6 +27,7 @@ import mlrun
 import mlrun.common.constants as mlrun_constants
 import mlrun.common.helpers
 import mlrun.common.schemas.model_monitoring.constants as mm_constants
+import mlrun.common.types
 import mlrun.datastore.datastore_profile as ds_profile
 import mlrun.errors
 import mlrun.model_monitoring.api as mm_api
@@ -39,6 +40,12 @@ from mlrun.serving.utils import MonitoringApplicationToDict
 from mlrun.utils import logger
 
 
+class ExistingDataHandling(mlrun.common.types.StrEnum):
+    fail_on_overlap = "fail_on_overlap"
+    skip_overlap = "skip_overlap"
+    delete_all = "delete_all"
+
+
 def _serialize_context_and_result(
     *,
     context: mm_context.MonitoringApplicationContext,
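Since ExistingDataHandling derives from StrEnum, its members compare equal to their raw string values, so callers can pass either form. A minimal sketch of the assumed semantics, using the standard-library enum module as a stand-in for mlrun.common.types.StrEnum:

from enum import Enum

class StrEnum(str, Enum):  # stand-in for mlrun.common.types.StrEnum
    pass

class ExistingDataHandling(StrEnum):
    fail_on_overlap = "fail_on_overlap"
    skip_overlap = "skip_overlap"
    delete_all = "delete_all"

# Members are interchangeable with their string values:
assert ExistingDataHandling("skip_overlap") is ExistingDataHandling.skip_overlap
assert ExistingDataHandling.delete_all == "delete_all"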
@@ -226,7 +233,7 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
         try:
             yield endpoints_output, application_schedules.__enter__()
         finally:
-            if write_output:
+            if write_output and any(endpoints_output.values()):
                 logger.debug(
                     "Pushing model monitoring application job data to the writer stream",
                     passed_stream_profile=str(stream_profile),
@@ -288,7 +295,7 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
         end: Optional[str] = None,
         base_period: Optional[int] = None,
         write_output: bool = False,
-
+        existing_data_handling: ExistingDataHandling = ExistingDataHandling.fail_on_overlap,
         stream_profile: Optional[ds_profile.DatastoreProfile] = None,
     ):
         """
@@ -325,21 +332,11 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
             project=project,
         ) as (endpoints_output, application_schedules):
 
-            def call_do_tracking(
+            def call_do_tracking(
+                monitoring_context: mm_context.MonitoringApplicationContext,
+            ):
                 nonlocal endpoints_output
 
-                if event is None:
-                    event = {}
-                monitoring_context = (
-                    mm_context.MonitoringApplicationContext._from_ml_ctx(
-                        event=event,
-                        application_name=application_name,
-                        context=context,
-                        project=project,
-                        sample_df=sample_data,
-                        feature_stats=feature_stats,
-                    )
-                )
                 result = self.do_tracking(monitoring_context)
                 endpoints_output[monitoring_context.endpoint_id].append(
                     (monitoring_context, result)
@@ -347,99 +344,184 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
                 return result
 
         if endpoints is not None:
-            resolved_endpoints = self.
+            resolved_endpoints = self._normalize_and_validate_endpoints(
                 project=project, endpoints=endpoints
             )
+            if (
+                write_output
+                and existing_data_handling == ExistingDataHandling.delete_all
+            ):
+                endpoint_ids = [
+                    endpoint_id for _, endpoint_id in resolved_endpoints
+                ]
+                context.logger.info(
+                    "Deleting all the application data before running the application",
+                    application_name=application_name,
+                    endpoint_ids=endpoint_ids,
+                )
+                self._delete_application_data(
+                    project_name=project.name,
+                    application_name=application_name,
+                    endpoint_ids=endpoint_ids,
+                    application_schedules=application_schedules,
+                )
             for endpoint_name, endpoint_id in resolved_endpoints:
-                for
+                for monitoring_ctx in self._window_generator(
                     start=start,
                     end=end,
                     base_period=base_period,
                     application_schedules=application_schedules,
                     endpoint_id=endpoint_id,
+                    endpoint_name=endpoint_name,
                     application_name=application_name,
-
+                    existing_data_handling=existing_data_handling,
+                    sample_data=sample_data,
+                    context=context,
+                    project=project,
                 ):
-                    result = call_do_tracking(
-                        event={
-                            mm_constants.ApplicationEvent.ENDPOINT_NAME: endpoint_name,
-                            mm_constants.ApplicationEvent.ENDPOINT_ID: endpoint_id,
-                            mm_constants.ApplicationEvent.START_INFER_TIME: window_start,
-                            mm_constants.ApplicationEvent.END_INFER_TIME: window_end,
-                        }
-                    )
+                    result = call_do_tracking(monitoring_ctx)
                     result_key = (
-                        f"{endpoint_name}-{endpoint_id}_{
-                        if
+                        f"{endpoint_name}-{endpoint_id}_{monitoring_ctx.start_infer_time.isoformat()}_{monitoring_ctx.end_infer_time.isoformat()}"
+                        if monitoring_ctx.start_infer_time
+                        and monitoring_ctx.end_infer_time
                         else f"{endpoint_name}-{endpoint_id}"
                    )
 
                    context.log_result(
                        result_key, self._flatten_data_result(result)
                    )
+            # Check if no result was produced for any endpoint (e.g., due to no data in all windows)
+            if not any(endpoints_output.values()):
+                context.logger.warning(
+                    "No data was found for any of the specified endpoints. "
+                    "No results were produced",
+                    application_name=application_name,
+                    endpoints=endpoints,
+                    start=start,
+                    end=end,
+                )
         else:
-
+            result = call_do_tracking(
+                mm_context.MonitoringApplicationContext._from_ml_ctx(
+                    context=context,
+                    project=project,
+                    application_name=application_name,
+                    event={},
+                    sample_df=sample_data,
+                    feature_stats=feature_stats,
+                )
+            )
+            return self._flatten_data_result(result)
 
     @staticmethod
-    def
+    def _check_endpoints_first_request(
+        endpoints: list[mlrun.common.schemas.ModelEndpoint],
+    ) -> None:
+        """Make sure that all the endpoints have had at least one request"""
+        endpoints_no_requests = [
+            (endpoint.metadata.name, endpoint.metadata.uid)
+            for endpoint in endpoints
+            if not endpoint.status.first_request
+        ]
+        if endpoints_no_requests:
+            raise mlrun.errors.MLRunValueError(
+                "The following model endpoints have not had any requests yet and "
+                "have no data, cannot run the model monitoring application on them: "
+                f"{endpoints_no_requests}"
+            )
+
+    @classmethod
+    def _normalize_and_validate_endpoints(
+        cls,
         project: "mlrun.MlrunProject",
         endpoints: Union[
             list[tuple[str, str]], list[list[str]], list[str], Literal["all"]
         ],
-    ) ->
-        if
-
-
-
-        )
-
-
-
-
-
-
-
-
-
-
-
+    ) -> list[tuple[str, str]]:
+        if isinstance(endpoints, list):
+            if all(
+                isinstance(endpoint, tuple | list) and len(endpoint) == 2
+                for endpoint in endpoints
+            ):
+                # A list of [(name, uid), ...] / [[name, uid], ...] tuples/lists
+                endpoint_uids_to_names = {
+                    endpoint[1]: endpoint[0] for endpoint in endpoints
+                }
+                endpoints_list = project.list_model_endpoints(
+                    uids=list(endpoint_uids_to_names.keys()), latest_only=True
+                ).endpoints
+
+                # Check for missing endpoint uids or name/uid mismatches
+                for endpoint in endpoints_list:
+                    if (
+                        endpoint_uids_to_names[cast(str, endpoint.metadata.uid)]
+                        != endpoint.metadata.name
+                    ):
+                        raise mlrun.errors.MLRunNotFoundError(
+                            "Could not find model endpoint with name "
+                            f"'{endpoint_uids_to_names[cast(str, endpoint.metadata.uid)]}' "
+                            f"and uid '{endpoint.metadata.uid}'"
+                        )
+                missing = set(endpoint_uids_to_names.keys()) - {
+                    cast(str, endpoint.metadata.uid) for endpoint in endpoints_list
+                }
+                if missing:
+                    raise mlrun.errors.MLRunNotFoundError(
+                        "Could not find model endpoints with the following uids: "
+                        f"{missing}"
                    )
-        else:
-            raise mlrun.errors.MLRunValueError(
-                f"Could not resolve endpoints as list of [(name, uid)], {endpoints=}"
-            )
 
-
-
-
-
-
-
-
-
-        if endpoints_list:
-            list_endpoints_result = [
-                (endpoint.metadata.name, endpoint.metadata.uid)
-                for endpoint in endpoints_list
-            ]
-        if endpoints != "all":
+            elif all(isinstance(endpoint, str) for endpoint in endpoints):
+                # A list of [name, ...] strings
+                endpoint_names = cast(list[str], endpoints)
+                endpoints_list = project.list_model_endpoints(
+                    names=endpoint_names, latest_only=True
+                ).endpoints
+
+                # Check for missing endpoint names
                 missing = set(endpoints) - {
-                    endpoint
+                    endpoint.metadata.name for endpoint in endpoints_list
                 }
                 if missing:
                     logger.warning(
                         "Could not list all the required endpoints",
-
-
+                        missing_endpoints=missing,
+                        endpoints_list=endpoints_list,
                     )
-
+            else:
+                raise mlrun.errors.MLRunValueError(
+                    "Could not resolve the following list as a list of endpoints:\n"
+                    f"{endpoints}\n"
+                    "The list must be either a list of (name, uid) tuples/lists or a list of names."
+                )
+        elif endpoints == "all":
+            endpoints_list = project.list_model_endpoints(latest_only=True).endpoints
+        elif isinstance(endpoints, str):
+            raise mlrun.errors.MLRunValueError(
+                'A string input for `endpoints` can only be "all" for all the model endpoints in '
+                "the project. If you want to select a single model endpoint with the given name, "
+                f'use a list: `endpoints=["{endpoints}"]`.'
+            )
         else:
-
-
+            raise mlrun.errors.MLRunValueError(
+                "Could not resolve the `endpoints` parameter. The parameter must be either:\n"
+                "- a list of (name, uid) tuples/lists\n"
+                "- a list of names\n"
+                '- the string "all" for all the model endpoints in the project.'
+            )
+
+        if not endpoints_list:
             raise mlrun.errors.MLRunNotFoundError(
-                f"Did not find any model endpoints {
+                f"Did not find any model endpoints {endpoints=}"
             )
 
+        cls._check_endpoints_first_request(endpoints_list)
+
+        return [
+            (endpoint.metadata.name, cast(str, endpoint.metadata.uid))
+            for endpoint in endpoints_list
+        ]
+
     @staticmethod
     def _validate_and_get_window_length(
         *, base_period: int, start_dt: datetime, end_dt: datetime
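The normalization above reduces every accepted `endpoints` shape to a list of (name, uid) pairs. A hedged illustration of the accepted call shapes (the names and uids below are made up):

# (name, uid) pairs, as tuples or two-element lists:
endpoints = [("churn-model", "a1b2c3"), ["fraud-model", "d4e5f6"]]
# names only; uids are resolved via project.list_model_endpoints:
endpoints = ["churn-model", "fraud-model"]
# every model endpoint in the project:
endpoints = "all"

Any other string raises an MLRunValueError suggesting a single-element list instead.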
@@ -481,7 +563,7 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
         end_dt: datetime,
         base_period: Optional[int],
         application_name: str,
-
+        existing_data_handling: ExistingDataHandling,
     ) -> datetime:
         """Make sure that the (app, endpoint) pair doesn't write output before the last analyzed window"""
         if application_schedules:
@@ -490,7 +572,7 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
             )
             if last_analyzed:
                 if start_dt < last_analyzed:
-                    if
+                    if existing_data_handling == ExistingDataHandling.skip_overlap:
                         if last_analyzed < end_dt and base_period is None:
                             logger.warn(
                                 "Setting the start time to last_analyzed since the original start time precedes "
@@ -504,15 +586,17 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
                         else:
                             raise mlrun.errors.MLRunValueError(
                                 "The start time for the application and endpoint precedes the last analyzed time: "
-                                f"{start_dt
+                                f"start_dt='{start_dt}', last_analyzed='{last_analyzed}', {application_name=}, "
+                                f"{endpoint_id=}. "
                                 "Writing data out of order is not supported, and the start time could not be "
                                 "dynamically reset, as last_analyzed is later than the given end time or that "
-                                f"base_period was specified ({end_dt
+                                f"base_period was specified (end_dt='{end_dt}', {base_period=})."
                             )
                     else:
                         raise mlrun.errors.MLRunValueError(
                             "The start time for the application and endpoint precedes the last analyzed time: "
-                            f"{start_dt
+                            f"start_dt='{start_dt}', last_analyzed='{last_analyzed}', {application_name=}, "
+                            f"{endpoint_id=}. "
                             "Writing data out of order is not supported. You should change the start time to "
                             f"'{last_analyzed}' or later."
                         )
@@ -525,6 +609,25 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
         )
         return start_dt
 
+    @staticmethod
+    def _delete_application_data(
+        project_name: str,
+        application_name: str,
+        endpoint_ids: list[str],
+        application_schedules: Optional[
+            mm_schedules.ModelMonitoringSchedulesFileApplication
+        ],
+    ) -> None:
+        mlrun.get_run_db().delete_model_monitoring_metrics(
+            project=project_name,
+            application_name=application_name,
+            endpoint_ids=endpoint_ids,
+        )
+        if application_schedules:
+            application_schedules.delete_endpoints_last_analyzed(
+                endpoint_uids=endpoint_ids
+            )
+
     @classmethod
     def _window_generator(
         cls,
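_delete_application_data is a thin wrapper over the run DB. A hedged sketch of invoking the same cleanup directly, with made-up project, application, and endpoint identifiers:

import mlrun

db = mlrun.get_run_db()
# Remove everything this application wrote for the given endpoints,
# mirroring _delete_application_data above (argument values are illustrative):
db.delete_model_monitoring_metrics(
    project="my-project",
    application_name="my-app",
    endpoint_ids=["a1b2c3"],
)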
@@ -535,34 +638,79 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
         application_schedules: Optional[
             mm_schedules.ModelMonitoringSchedulesFileApplication
         ],
+        endpoint_name: str,
         endpoint_id: str,
         application_name: str,
-
-
+        existing_data_handling: ExistingDataHandling,
+        context: "mlrun.MLClientCtx",
+        project: "mlrun.MlrunProject",
+        sample_data: Optional[pd.DataFrame],
+    ) -> Iterator[mm_context.MonitoringApplicationContext]:
+        def yield_monitoring_ctx(
+            window_start: Optional[datetime], window_end: Optional[datetime]
+        ) -> Iterator[mm_context.MonitoringApplicationContext]:
+            ctx = mm_context.MonitoringApplicationContext._from_ml_ctx(
+                event={
+                    mm_constants.ApplicationEvent.ENDPOINT_NAME: endpoint_name,
+                    mm_constants.ApplicationEvent.ENDPOINT_ID: endpoint_id,
+                    mm_constants.ApplicationEvent.START_INFER_TIME: window_start,
+                    mm_constants.ApplicationEvent.END_INFER_TIME: window_end,
+                },
+                application_name=application_name,
+                context=context,
+                project=project,
+                sample_df=sample_data,
+            )
+
+            if ctx.sample_df.empty:
+                # The current sample is empty
+                context.logger.debug(
+                    "No sample data available for tracking",
+                    application_name=application_name,
+                    endpoint_id=ctx.endpoint_id,
+                    start_time=ctx.start_infer_time,
+                    end_time=ctx.end_infer_time,
+                )
+                return
+
+            yield ctx
+
+            if application_schedules and window_end:
+                application_schedules.update_endpoint_last_analyzed(
+                    endpoint_uid=endpoint_id, last_analyzed=window_end
+                )
+
         if start is None or end is None:
             # A single window based on the `sample_data` input - see `_handler`.
-            yield None, None
+            yield from yield_monitoring_ctx(None, None)
             return
 
         start_dt = datetime.fromisoformat(start)
         end_dt = datetime.fromisoformat(end)
 
-        start_dt
-
-
-
-
-
-
-
+        # If `start_dt` and `end_dt` do not include time zone information - change them to UTC
+        if (start_dt.tzinfo is None) and (end_dt.tzinfo is None):
+            start_dt = start_dt.replace(tzinfo=UTC)
+            end_dt = end_dt.replace(tzinfo=UTC)
+        elif (start_dt.tzinfo is None) or (end_dt.tzinfo is None):
+            raise mlrun.errors.MLRunValueError(
+                "The start and end times must either both include time zone information or both be naive (no time "
+                f"zone). Asserting the above failed, aborting the evaluate request: start={start}, end={end}."
+            )
+
+        if existing_data_handling != ExistingDataHandling.delete_all:
+            start_dt = cls._validate_monotonically_increasing_data(
+                application_schedules=application_schedules,
+                endpoint_id=endpoint_id,
+                start_dt=start_dt,
+                end_dt=end_dt,
+                base_period=base_period,
+                application_name=application_name,
+                existing_data_handling=existing_data_handling,
+            )
 
         if base_period is None:
-            yield start_dt, end_dt
-            if application_schedules:
-                application_schedules.update_endpoint_last_analyzed(
-                    endpoint_uid=endpoint_id, last_analyzed=end_dt
-                )
+            yield from yield_monitoring_ctx(start_dt, end_dt)
             return
 
         window_length = cls._validate_and_get_window_length(
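The windowing logic above is self-contained: naive start/end timestamps are both coerced to UTC (mixing naive and aware raises an error), and base_period slices the interval into fixed-length windows with the last window clipped at end. A standalone sketch of that behavior, assuming for illustration that base_period is a number of minutes:

from collections.abc import Iterator
from datetime import UTC, datetime, timedelta

def windows(
    start: datetime, end: datetime, base_period: int
) -> Iterator[tuple[datetime, datetime]]:
    # Naive timestamps are treated as UTC, as in the diff above
    if start.tzinfo is None and end.tzinfo is None:
        start, end = start.replace(tzinfo=UTC), end.replace(tzinfo=UTC)
    length = timedelta(minutes=base_period)
    current = start
    while current < end:
        current_end = min(current + length, end)  # clip the last window
        yield current, current_end
        current = current_end

# Three one-hour (start, end] windows over a three-hour interval:
list(windows(datetime(2025, 1, 1, 0), datetime(2025, 1, 1, 3), 60))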
@@ -572,11 +720,7 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
         current_start_time = start_dt
         while current_start_time < end_dt:
             current_end_time = min(current_start_time + window_length, end_dt)
-            yield current_start_time, current_end_time
-            if application_schedules:
-                application_schedules.update_endpoint_last_analyzed(
-                    endpoint_uid=endpoint_id, last_analyzed=current_end_time
-                )
+            yield from yield_monitoring_ctx(current_start_time, current_end_time)
             current_start_time = current_end_time
 
     @classmethod
@@ -647,7 +791,7 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
         else:
             class_name = handler_to_class.split(".")[-1].split("::")[0]
 
-        job_name = mlrun.utils.normalize_name(class_name
+        job_name = mlrun.utils.normalize_name(class_name)
 
         if not mm_constants.APP_NAME_REGEX.fullmatch(job_name):
             raise mlrun.errors.MLRunValueError(
@@ -655,10 +799,13 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
                 f"`{mm_constants.APP_NAME_REGEX.pattern}`. "
                 "Please choose another `func_name`."
             )
-
-        job_name
+        job_name, was_renamed, suffix = mlrun.utils.helpers.ensure_batch_job_suffix(
+            job_name
+        )
+        if was_renamed:
             mlrun.utils.logger.info(
-                'Changing function name - adding `"
+                f'Changing function name - adding `"{suffix}"` suffix',
+                func_name=job_name,
             )
 
         return job_name
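The call site above implies a contract for the new mlrun.utils.helpers.ensure_batch_job_suffix helper: it returns the possibly renamed job name, whether a rename happened, and the suffix involved. A hypothetical re-implementation of that contract; the actual suffix value is not visible in this diff:

def ensure_batch_job_suffix(job_name: str) -> tuple[str, bool, str]:
    suffix = "-batch"  # placeholder; the real value is defined inside mlrun
    if job_name.endswith(suffix):
        return job_name, False, suffix
    return job_name + suffix, True, suffix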
@@ -702,7 +849,12 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
         * ``end``, ``datetime``
         * ``base_period``, ``int``
         * ``write_output``, ``bool``
-        * ``
+        * ``existing_data_handling``, ``str``
+        * ``_init_args``, ``dict`` - the arguments for the application class constructor
+          (equivalent to ``class_arguments``)
+
+        See :py:meth:`~ModelMonitoringApplicationBase.evaluate` for more details
+        about these inputs and params.
 
         For Git sources, add the source archive to the returned job and change the handler:
 
@@ -781,6 +933,7 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
         image: Optional[str] = None,
         with_repo: Optional[bool] = False,
         class_handler: Optional[str] = None,
+        class_arguments: Optional[dict[str, Any]] = None,
         requirements: Optional[Union[str, list[str]]] = None,
         requirements_file: str = "",
         endpoints: Union[list[tuple[str, str]], list[str], Literal["all"], None] = None,
@@ -788,7 +941,7 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
         end: Optional[datetime] = None,
         base_period: Optional[int] = None,
         write_output: bool = False,
-
+        existing_data_handling: ExistingDataHandling = ExistingDataHandling.fail_on_overlap,
         stream_profile: Optional[ds_profile.DatastoreProfile] = None,
     ) -> "mlrun.RunObject":
         """
@@ -796,7 +949,7 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
         :py:meth:`~mlrun.model_monitoring.applications.ModelMonitoringApplicationBase.do_tracking`
         model monitoring logic as a :py:class:`~mlrun.runtimes.KubejobRuntime`, which is an MLRun function.
 
-        This function has default values for all of its arguments. You should
+        This function has default values for all of its arguments. You should change them when you want to pass
         data to the application.
 
         :param func_path: The path to the function. If ``None``, the current notebook is used.
@@ -813,9 +966,13 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
         :param reference_data: Pandas data-frame or :py:class:`~mlrun.artifacts.dataset.DatasetArtifact` URI as
                                the reference dataset.
                                When set, its statistics override the model endpoint's feature statistics.
+                               You do not need to have a model endpoint to use this option.
         :param image: Docker image to run the job on (when running remotely).
         :param with_repo: Whether to clone the current repo to the build source.
-        :param class_handler: The relative path to the class, useful when using Git sources or code
+        :param class_handler: The relative path to the application class, useful when using Git sources or code
+                              from images.
+        :param class_arguments: The arguments for the application class constructor. These are passed to the
+                                class ``__init__``. The values must be JSON-serializable.
         :param requirements: List of Python requirements to be installed in the image.
         :param requirements_file: Path to a Python requirements file to be installed in the image.
         :param endpoints: The model endpoints to get the data from. The options are:
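A hedged sketch of the new class_arguments parameter in use: the dict is forwarded to the application class __init__ as the _init_args job param (the application name and argument below are made up):

from mlrun.model_monitoring.applications import ModelMonitoringApplicationBase

class ThresholdApp(ModelMonitoringApplicationBase):
    def __init__(self, threshold: float = 0.5):
        self.threshold = threshold

    def do_tracking(self, monitoring_context):
        ...  # compare drift metrics against self.threshold

# The values must be JSON-serializable, since they travel as job params:
# ThresholdApp.evaluate(..., class_arguments={"threshold": 0.8})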
@@ -833,8 +990,9 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
         :param start: The start time of the endpoint's data, not included.
                       If you want the model endpoint's data at ``start`` included, you need to subtract a
                       small ``datetime.timedelta`` from it.
-                      Make sure to include the time zone when constructing
-                      manually.
+                      Make sure to include the time zone when constructing ``datetime.datetime`` objects
+                      manually. When both ``start`` and ``end`` times do not include a time zone, they will
+                      be treated as UTC.
         :param end: The end time of the endpoint's data, included.
                     Please note: when ``start`` and ``end`` are set, they create a left-open time interval
                     ("window") :math:`(\\operatorname{start}, \\operatorname{end}]` that excludes the
@@ -856,11 +1014,18 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
         :param write_output: Whether to write the results and metrics to the time-series DB. Can be ``True`` only
                              if ``endpoints`` are passed.
                              Note: the model monitoring infrastructure must be up for the writing to work.
-        :param
-
-
-
-
+        :param existing_data_handling:
+            How to handle the existing application data for the model endpoints when writing
+            new data whose requested ``start`` time precedes the ``end`` time of a previous run
+            that also wrote to the database. Relevant only when ``write_output=True``.
+            The options are:
+
+            - ``"fail_on_overlap"``: Default. An error is raised.
+            - ``"skip_overlap"``: the overlapping data is ignored and the
+              time window is cut so that it starts at the earliest possible time after ``start``.
+            - ``"delete_all"``: delete all the data that was written by the application to the
+              model endpoints, regardless of the time window, and write the new data.
+
         :param stream_profile: The stream datastore profile. It should be provided only when running locally and
                                writing the outputs to the database (i.e., when both ``run_local`` and
                                ``write_output`` are set to ``True``).
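Putting the new parameters together, a usage sketch for the updated evaluate signature; the endpoint name and times are made up, and evaluate is assumed to be invoked as a classmethod as in earlier mlrun releases:

from mlrun.model_monitoring.applications import ModelMonitoringApplicationBase

class MyApp(ModelMonitoringApplicationBase):
    def do_tracking(self, monitoring_context):
        ...

run = MyApp.evaluate(
    endpoints=["churn-model"],          # or [("churn-model", "<uid>")], or "all"
    start="2025-01-01T00:00:00+00:00",  # naive times are treated as UTC
    end="2025-01-02T00:00:00+00:00",
    write_output=True,
    # Skip windows already covered by a previous run instead of failing:
    existing_data_handling="skip_overlap",
)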
@@ -885,7 +1050,9 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
             project=project,
         )
 
-        params: dict[
+        params: dict[
+            str, Union[list, dict, str, int, None, ds_profile.DatastoreProfile]
+        ] = {}
         if endpoints:
             params["endpoints"] = endpoints
             if sample_data is None:
@@ -899,18 +1066,6 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
             )
             params["end"] = end.isoformat() if isinstance(end, datetime) else end
             params["base_period"] = base_period
-            params["write_output"] = write_output
-            params["fail_on_overlap"] = fail_on_overlap
-            if stream_profile:
-                if not run_local:
-                    raise mlrun.errors.MLRunValueError(
-                        "Passing a `stream_profile` is relevant only when running locally"
-                    )
-                if not write_output:
-                    raise mlrun.errors.MLRunValueError(
-                        "Passing a `stream_profile` is relevant only when writing the outputs"
-                    )
-                params["stream_profile"] = stream_profile
         elif start or end or base_period:
             raise mlrun.errors.MLRunValueError(
                 "Custom `start` and `end` times or base_period are supported only with endpoints data"
@@ -920,6 +1075,22 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
                 "Writing the application output or passing `stream_profile` are supported only with endpoints data"
             )
 
+        params["write_output"] = write_output
+        params["existing_data_handling"] = existing_data_handling
+        if stream_profile:
+            if not run_local:
+                raise mlrun.errors.MLRunValueError(
+                    "Passing a `stream_profile` is relevant only when running locally"
+                )
+            if not write_output:
+                raise mlrun.errors.MLRunValueError(
+                    "Passing a `stream_profile` is relevant only when writing the outputs"
+                )
+            params["stream_profile"] = stream_profile
+
+        if class_arguments:
+            params["_init_args"] = class_arguments
+
         inputs: dict[str, str] = {}
         for data, identifier in [
             (sample_data, "sample_data"),
|