mlrun 1.7.0rc5__py3-none-any.whl → 1.7.0rc7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mlrun might be problematic. Click here for more details.
- mlrun/artifacts/base.py +2 -1
- mlrun/artifacts/plots.py +9 -5
- mlrun/common/constants.py +6 -0
- mlrun/common/schemas/__init__.py +2 -0
- mlrun/common/schemas/model_monitoring/__init__.py +4 -0
- mlrun/common/schemas/model_monitoring/constants.py +35 -18
- mlrun/common/schemas/project.py +1 -0
- mlrun/common/types.py +7 -1
- mlrun/config.py +19 -6
- mlrun/data_types/data_types.py +4 -0
- mlrun/datastore/alibaba_oss.py +130 -0
- mlrun/datastore/azure_blob.py +4 -5
- mlrun/datastore/base.py +22 -16
- mlrun/datastore/datastore.py +4 -0
- mlrun/datastore/google_cloud_storage.py +1 -1
- mlrun/datastore/sources.py +7 -7
- mlrun/db/base.py +14 -6
- mlrun/db/factory.py +1 -1
- mlrun/db/httpdb.py +61 -56
- mlrun/db/nopdb.py +3 -0
- mlrun/launcher/__init__.py +1 -1
- mlrun/launcher/base.py +1 -1
- mlrun/launcher/client.py +1 -1
- mlrun/launcher/factory.py +1 -1
- mlrun/launcher/local.py +1 -1
- mlrun/launcher/remote.py +1 -1
- mlrun/model.py +1 -0
- mlrun/model_monitoring/__init__.py +1 -1
- mlrun/model_monitoring/api.py +104 -301
- mlrun/model_monitoring/application.py +21 -21
- mlrun/model_monitoring/applications/histogram_data_drift.py +130 -40
- mlrun/model_monitoring/controller.py +26 -33
- mlrun/model_monitoring/db/__init__.py +16 -0
- mlrun/model_monitoring/{stores → db/stores}/__init__.py +43 -34
- mlrun/model_monitoring/db/stores/base/__init__.py +15 -0
- mlrun/model_monitoring/{stores/model_endpoint_store.py → db/stores/base/store.py} +47 -6
- mlrun/model_monitoring/db/stores/sqldb/__init__.py +13 -0
- mlrun/model_monitoring/db/stores/sqldb/models/__init__.py +49 -0
- mlrun/model_monitoring/{stores → db/stores/sqldb}/models/base.py +76 -3
- mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +68 -0
- mlrun/model_monitoring/{stores → db/stores/sqldb}/models/sqlite.py +13 -1
- mlrun/model_monitoring/db/stores/sqldb/sql_store.py +662 -0
- mlrun/model_monitoring/db/stores/v3io_kv/__init__.py +13 -0
- mlrun/model_monitoring/{stores/kv_model_endpoint_store.py → db/stores/v3io_kv/kv_store.py} +134 -3
- mlrun/model_monitoring/features_drift_table.py +34 -22
- mlrun/model_monitoring/helpers.py +45 -6
- mlrun/model_monitoring/stream_processing.py +43 -9
- mlrun/model_monitoring/tracking_policy.py +7 -1
- mlrun/model_monitoring/writer.py +4 -36
- mlrun/projects/pipelines.py +13 -1
- mlrun/projects/project.py +279 -117
- mlrun/run.py +72 -74
- mlrun/runtimes/__init__.py +35 -0
- mlrun/runtimes/base.py +7 -1
- mlrun/runtimes/nuclio/api_gateway.py +188 -61
- mlrun/runtimes/nuclio/application/__init__.py +15 -0
- mlrun/runtimes/nuclio/application/application.py +283 -0
- mlrun/runtimes/nuclio/application/reverse_proxy.go +87 -0
- mlrun/runtimes/nuclio/function.py +53 -1
- mlrun/runtimes/nuclio/serving.py +28 -32
- mlrun/runtimes/pod.py +27 -1
- mlrun/serving/server.py +4 -6
- mlrun/serving/states.py +41 -33
- mlrun/utils/helpers.py +34 -0
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.7.0rc5.dist-info → mlrun-1.7.0rc7.dist-info}/METADATA +14 -5
- {mlrun-1.7.0rc5.dist-info → mlrun-1.7.0rc7.dist-info}/RECORD +71 -64
- mlrun/model_monitoring/batch.py +0 -974
- mlrun/model_monitoring/stores/models/__init__.py +0 -27
- mlrun/model_monitoring/stores/models/mysql.py +0 -34
- mlrun/model_monitoring/stores/sql_model_endpoint_store.py +0 -382
- {mlrun-1.7.0rc5.dist-info → mlrun-1.7.0rc7.dist-info}/LICENSE +0 -0
- {mlrun-1.7.0rc5.dist-info → mlrun-1.7.0rc7.dist-info}/WHEEL +0 -0
- {mlrun-1.7.0rc5.dist-info → mlrun-1.7.0rc7.dist-info}/entry_points.txt +0 -0
- {mlrun-1.7.0rc5.dist-info → mlrun-1.7.0rc7.dist-info}/top_level.txt +0 -0
|
@@ -16,17 +16,18 @@
|
|
|
16
16
|
import json
|
|
17
17
|
import os
|
|
18
18
|
import typing
|
|
19
|
+
from http import HTTPStatus
|
|
19
20
|
|
|
20
21
|
import v3io.dataplane
|
|
22
|
+
import v3io.dataplane.response
|
|
21
23
|
import v3io_frames
|
|
22
24
|
|
|
23
25
|
import mlrun.common.model_monitoring.helpers
|
|
24
26
|
import mlrun.common.schemas.model_monitoring
|
|
27
|
+
import mlrun.model_monitoring.db
|
|
25
28
|
import mlrun.utils.v3io_clients
|
|
26
29
|
from mlrun.utils import logger
|
|
27
30
|
|
|
28
|
-
from .model_endpoint_store import ModelEndpointStore
|
|
29
|
-
|
|
30
31
|
# Fields to encode before storing in the KV table or to decode after retrieving
|
|
31
32
|
fields_to_encode_decode = [
|
|
32
33
|
mlrun.common.schemas.model_monitoring.EventFieldType.FEATURE_STATS,
|
|
@@ -34,7 +35,7 @@ fields_to_encode_decode = [
|
|
|
34
35
|
]
|
|
35
36
|
|
|
36
37
|
|
|
37
|
-
class KVModelEndpointStore(ModelEndpointStore):
|
|
38
|
+
class KVStoreBase(mlrun.model_monitoring.db.StoreBase):
|
|
38
39
|
"""
|
|
39
40
|
Handles the DB operations when the DB target is of type KV. For the KV operations, we use an instance of V3IO
|
|
40
41
|
client and usually the KV table can be found under v3io:///users/pipelines/project-name/model-endpoints/endpoints/.
|
|
@@ -394,6 +395,128 @@ class KVModelEndpointStore(ModelEndpointStore):
|
|
|
394
395
|
|
|
395
396
|
return metrics_mapping
|
|
396
397
|
|
|
398
|
+
def write_application_result(self, event: dict[str, typing.Any]):
|
|
399
|
+
"""
|
|
400
|
+
Write a new application result event in the target table.
|
|
401
|
+
|
|
402
|
+
:param event: An event dictionary that represents the application result. It should correspond to the
|
|
403
|
+
schema defined in the :py:class:`~mlrun.common.schemas.model_monitoring.constants.WriterEvent`
|
|
404
|
+
object.
|
|
405
|
+
"""
|
|
406
|
+
endpoint_id = event.pop(
|
|
407
|
+
mlrun.common.schemas.model_monitoring.WriterEvent.ENDPOINT_ID
|
|
408
|
+
)
|
|
409
|
+
app_name = event.pop(
|
|
410
|
+
mlrun.common.schemas.model_monitoring.WriterEvent.APPLICATION_NAME
|
|
411
|
+
)
|
|
412
|
+
metric_name = event.pop(
|
|
413
|
+
mlrun.common.schemas.model_monitoring.WriterEvent.RESULT_NAME
|
|
414
|
+
)
|
|
415
|
+
attributes = {metric_name: json.dumps(event)}
|
|
416
|
+
|
|
417
|
+
v3io_monitoring_apps_container = self.get_v3io_monitoring_apps_container(
|
|
418
|
+
project_name=self.project
|
|
419
|
+
)
|
|
420
|
+
|
|
421
|
+
self.client.kv.update(
|
|
422
|
+
container=v3io_monitoring_apps_container,
|
|
423
|
+
table_path=endpoint_id,
|
|
424
|
+
key=app_name,
|
|
425
|
+
attributes=attributes,
|
|
426
|
+
)
|
|
427
|
+
|
|
428
|
+
schema_file = self.client.kv.new_cursor(
|
|
429
|
+
container=v3io_monitoring_apps_container,
|
|
430
|
+
table_path=endpoint_id,
|
|
431
|
+
filter_expression='__name==".#schema"',
|
|
432
|
+
)
|
|
433
|
+
|
|
434
|
+
if not schema_file.all():
|
|
435
|
+
logger.info(
|
|
436
|
+
"Generate a new V3IO KV schema file",
|
|
437
|
+
container=v3io_monitoring_apps_container,
|
|
438
|
+
endpoint_id=endpoint_id,
|
|
439
|
+
)
|
|
440
|
+
self._generate_kv_schema(endpoint_id, v3io_monitoring_apps_container)
|
|
441
|
+
logger.info("Updated V3IO KV successfully", key=app_name)
|
|
442
|
+
|
|
443
|
+
def _generate_kv_schema(
|
|
444
|
+
self, endpoint_id: str, v3io_monitoring_apps_container: str
|
|
445
|
+
):
|
|
446
|
+
"""Generate V3IO KV schema file which will be used by the model monitoring applications dashboard in Grafana."""
|
|
447
|
+
fields = [
|
|
448
|
+
{
|
|
449
|
+
"name": mlrun.common.schemas.model_monitoring.WriterEvent.RESULT_NAME,
|
|
450
|
+
"type": "string",
|
|
451
|
+
"nullable": False,
|
|
452
|
+
}
|
|
453
|
+
]
|
|
454
|
+
res = self.client.kv.create_schema(
|
|
455
|
+
container=v3io_monitoring_apps_container,
|
|
456
|
+
table_path=endpoint_id,
|
|
457
|
+
key=mlrun.common.schemas.model_monitoring.WriterEvent.APPLICATION_NAME,
|
|
458
|
+
fields=fields,
|
|
459
|
+
)
|
|
460
|
+
if res.status_code != HTTPStatus.OK:
|
|
461
|
+
raise mlrun.errors.MLRunBadRequestError(
|
|
462
|
+
f"Couldn't infer schema for endpoint {endpoint_id} which is required for Grafana dashboards"
|
|
463
|
+
)
|
|
464
|
+
else:
|
|
465
|
+
logger.info(
|
|
466
|
+
"Generated V3IO KV schema successfully", endpoint_id=endpoint_id
|
|
467
|
+
)
|
|
468
|
+
|
|
469
|
+
def get_last_analyzed(self, endpoint_id: str, application_name: str) -> int:
|
|
470
|
+
"""
|
|
471
|
+
Get the last analyzed time for the provided model endpoint and application.
|
|
472
|
+
|
|
473
|
+
:param endpoint_id: The unique id of the model endpoint.
|
|
474
|
+
:param application_name: Registered application name.
|
|
475
|
+
|
|
476
|
+
:return: Timestamp as a Unix time.
|
|
477
|
+
:raise: MLRunNotFoundError if last analyzed value is not found.
|
|
478
|
+
|
|
479
|
+
"""
|
|
480
|
+
try:
|
|
481
|
+
data = self.client.kv.get(
|
|
482
|
+
container=self._get_monitoring_schedules_container(
|
|
483
|
+
project_name=self.project
|
|
484
|
+
),
|
|
485
|
+
table_path=endpoint_id,
|
|
486
|
+
key=application_name,
|
|
487
|
+
)
|
|
488
|
+
return data.output.item[
|
|
489
|
+
mlrun.common.schemas.model_monitoring.SchedulingKeys.LAST_ANALYZED
|
|
490
|
+
]
|
|
491
|
+
except v3io.dataplane.response.HttpResponseError as err:
|
|
492
|
+
logger.debug("Error while getting last analyzed time", err=err)
|
|
493
|
+
raise mlrun.errors.MLRunNotFoundError(
|
|
494
|
+
f"No last analyzed value has been found for {application_name} "
|
|
495
|
+
f"that processes model endpoint {endpoint_id}",
|
|
496
|
+
)
|
|
497
|
+
|
|
498
|
+
def update_last_analyzed(
|
|
499
|
+
self, endpoint_id: str, application_name: str, last_analyzed: int
|
|
500
|
+
):
|
|
501
|
+
"""
|
|
502
|
+
Update the last analyzed time for the provided model endpoint and application.
|
|
503
|
+
|
|
504
|
+
:param endpoint_id: The unique id of the model endpoint.
|
|
505
|
+
:param application_name: Registered application name.
|
|
506
|
+
:param last_analyzed: Timestamp as a Unix time that represents the last analyzed time of a certain
|
|
507
|
+
application and model endpoint.
|
|
508
|
+
"""
|
|
509
|
+
self.client.kv.put(
|
|
510
|
+
container=self._get_monitoring_schedules_container(
|
|
511
|
+
project_name=self.project
|
|
512
|
+
),
|
|
513
|
+
table_path=endpoint_id,
|
|
514
|
+
key=application_name,
|
|
515
|
+
attributes={
|
|
516
|
+
mlrun.common.schemas.model_monitoring.SchedulingKeys.LAST_ANALYZED: last_analyzed
|
|
517
|
+
},
|
|
518
|
+
)
|
|
519
|
+
|
|
397
520
|
def _generate_tsdb_paths(self) -> tuple[str, str]:
|
|
398
521
|
"""Generate a short path to the TSDB resources and a filtered path for the frames object
|
|
399
522
|
:return: A tuple of:
|
|
@@ -572,3 +695,11 @@ class KVModelEndpointStore(ModelEndpointStore):
|
|
|
572
695
|
if isinstance(field, bytes):
|
|
573
696
|
return field.decode()
|
|
574
697
|
return field
|
|
698
|
+
|
|
699
|
+
@staticmethod
|
|
700
|
+
def get_v3io_monitoring_apps_container(project_name: str) -> str:
|
|
701
|
+
return f"users/pipelines/{project_name}/monitoring-apps"
|
|
702
|
+
|
|
703
|
+
@staticmethod
|
|
704
|
+
def _get_monitoring_schedules_container(project_name: str) -> str:
|
|
705
|
+
return f"users/pipelines/{project_name}/monitoring-schedules/functions"
|
|
@@ -21,9 +21,34 @@ import plotly.graph_objects as go
|
|
|
21
21
|
from plotly.subplots import make_subplots
|
|
22
22
|
|
|
23
23
|
import mlrun.common.schemas.model_monitoring
|
|
24
|
+
from mlrun.artifacts import PlotlyArtifact
|
|
24
25
|
|
|
25
26
|
# A type for representing a drift result, a tuple of the status and the drift mean:
|
|
26
|
-
DriftResultType = tuple[
|
|
27
|
+
DriftResultType = tuple[
|
|
28
|
+
mlrun.common.schemas.model_monitoring.constants.ResultStatusApp, float
|
|
29
|
+
]
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class _PlotlyTableArtifact(PlotlyArtifact):
|
|
33
|
+
"""A custom class for plotly table artifacts"""
|
|
34
|
+
|
|
35
|
+
@staticmethod
|
|
36
|
+
def _disable_table_dragging(figure_html: str) -> str:
|
|
37
|
+
"""
|
|
38
|
+
Disable the table columns dragging by adding the following
|
|
39
|
+
JavaScript code
|
|
40
|
+
"""
|
|
41
|
+
start, end = figure_html.rsplit(";", 1)
|
|
42
|
+
middle = (
|
|
43
|
+
';for (const element of document.getElementsByClassName("table")) '
|
|
44
|
+
'{element.style.pointerEvents = "none";}'
|
|
45
|
+
)
|
|
46
|
+
figure_html = start + middle + end
|
|
47
|
+
return figure_html
|
|
48
|
+
|
|
49
|
+
def get_body(self) -> str:
|
|
50
|
+
"""Get the adjusted HTML representation of the figure"""
|
|
51
|
+
return self._disable_table_dragging(super().get_body())
|
|
27
52
|
|
|
28
53
|
|
|
29
54
|
class FeaturesDriftTablePlot:
|
|
@@ -62,9 +87,9 @@ class FeaturesDriftTablePlot:
|
|
|
62
87
|
|
|
63
88
|
# Status configurations:
|
|
64
89
|
_STATUS_COLORS = {
|
|
65
|
-
mlrun.common.schemas.model_monitoring.
|
|
66
|
-
mlrun.common.schemas.model_monitoring.
|
|
67
|
-
mlrun.common.schemas.model_monitoring.
|
|
90
|
+
mlrun.common.schemas.model_monitoring.constants.ResultStatusApp.no_detection: "rgb(0,176,80)", # Green
|
|
91
|
+
mlrun.common.schemas.model_monitoring.constants.ResultStatusApp.potential_detection: "rgb(255,192,0)", # Orange
|
|
92
|
+
mlrun.common.schemas.model_monitoring.constants.ResultStatusApp.detected: "rgb(208,0,106)", # Magenta
|
|
68
93
|
}
|
|
69
94
|
|
|
70
95
|
# Font configurations:
|
|
@@ -97,7 +122,7 @@ class FeaturesDriftTablePlot:
|
|
|
97
122
|
inputs_statistics: dict,
|
|
98
123
|
metrics: dict[str, Union[dict, float]],
|
|
99
124
|
drift_results: dict[str, DriftResultType],
|
|
100
|
-
) ->
|
|
125
|
+
) -> _PlotlyTableArtifact:
|
|
101
126
|
"""
|
|
102
127
|
Produce the html code of the table plot with the given information and the stored configurations in the class.
|
|
103
128
|
|
|
@@ -106,9 +131,8 @@ class FeaturesDriftTablePlot:
|
|
|
106
131
|
:param metrics: The drift detection metrics calculated on the sample set and inputs.
|
|
107
132
|
:param drift_results: The drift results per feature according to the rules of the monitor.
|
|
108
133
|
|
|
109
|
-
:return: The
|
|
134
|
+
:return: The drift table as a plotly artifact.
|
|
110
135
|
"""
|
|
111
|
-
# Plot the drift table:
|
|
112
136
|
figure = self._plot(
|
|
113
137
|
features=list(inputs_statistics.keys()),
|
|
114
138
|
sample_set_statistics=sample_set_statistics,
|
|
@@ -116,19 +140,7 @@ class FeaturesDriftTablePlot:
|
|
|
116
140
|
metrics=metrics,
|
|
117
141
|
drift_results=drift_results,
|
|
118
142
|
)
|
|
119
|
-
|
|
120
|
-
# Get its HTML representation:
|
|
121
|
-
figure_html = figure.to_html()
|
|
122
|
-
|
|
123
|
-
# Turn off the table columns dragging by injecting the following JavaScript code:
|
|
124
|
-
start, end = figure_html.rsplit(";", 1)
|
|
125
|
-
middle = (
|
|
126
|
-
';for (const element of document.getElementsByClassName("table")) '
|
|
127
|
-
'{element.style.pointerEvents = "none";}'
|
|
128
|
-
)
|
|
129
|
-
figure_html = start + middle + end
|
|
130
|
-
|
|
131
|
-
return figure_html
|
|
143
|
+
return _PlotlyTableArtifact(figure=figure, key="drift_table_plot")
|
|
132
144
|
|
|
133
145
|
def _read_columns_names(self, statistics_dictionary: dict, drift_metrics: dict):
|
|
134
146
|
"""
|
|
@@ -366,10 +378,10 @@ class FeaturesDriftTablePlot:
|
|
|
366
378
|
bins = np.array(bins)
|
|
367
379
|
if bins[0] == -sys.float_info.max:
|
|
368
380
|
bins[0] = bins[1] - (bins[2] - bins[1])
|
|
369
|
-
hovertext[0] = f"(
|
|
381
|
+
hovertext[0] = f"(-inf, {bins[1]})"
|
|
370
382
|
if bins[-1] == sys.float_info.max:
|
|
371
383
|
bins[-1] = bins[-2] + (bins[-2] - bins[-3])
|
|
372
|
-
hovertext[-1] = f"({bins[-2]},
|
|
384
|
+
hovertext[-1] = f"({bins[-2]}, inf)"
|
|
373
385
|
# Center the bins (leave the first one):
|
|
374
386
|
bins = 0.5 * (bins[:-1] + bins[1:])
|
|
375
387
|
# Plot the histogram as a line with filled background below it:
|
|
@@ -15,6 +15,9 @@
|
|
|
15
15
|
import datetime
|
|
16
16
|
import typing
|
|
17
17
|
|
|
18
|
+
import numpy as np
|
|
19
|
+
import pandas as pd
|
|
20
|
+
|
|
18
21
|
import mlrun
|
|
19
22
|
import mlrun.common.model_monitoring.helpers
|
|
20
23
|
import mlrun.common.schemas
|
|
@@ -36,10 +39,6 @@ class _BatchDict(typing.TypedDict):
|
|
|
36
39
|
days: int
|
|
37
40
|
|
|
38
41
|
|
|
39
|
-
class _MLRunNoRunsFoundError(Exception):
|
|
40
|
-
pass
|
|
41
|
-
|
|
42
|
-
|
|
43
42
|
def get_stream_path(
|
|
44
43
|
project: str = None,
|
|
45
44
|
function_name: str = mm_constants.MonitoringFunctionNames.STREAM,
|
|
@@ -55,8 +54,6 @@ def get_stream_path(
|
|
|
55
54
|
|
|
56
55
|
stream_uri = mlrun.get_secret_or_env(
|
|
57
56
|
mlrun.common.schemas.model_monitoring.ProjectSecretKeys.STREAM_PATH
|
|
58
|
-
if function_name is mm_constants.MonitoringFunctionNames.STREAM
|
|
59
|
-
else ""
|
|
60
57
|
) or mlrun.mlconf.get_model_monitoring_file_target_path(
|
|
61
58
|
project=project,
|
|
62
59
|
kind=mlrun.common.schemas.model_monitoring.FileTargetKind.STREAM,
|
|
@@ -212,3 +209,45 @@ def update_model_endpoint_last_request(
|
|
|
212
209
|
endpoint_id=model_endpoint.metadata.uid,
|
|
213
210
|
attributes={EventFieldType.LAST_REQUEST: bumped_last_request},
|
|
214
211
|
)
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
def calculate_inputs_statistics(
|
|
215
|
+
sample_set_statistics: dict, inputs: pd.DataFrame
|
|
216
|
+
) -> dict:
|
|
217
|
+
"""
|
|
218
|
+
Calculate the inputs data statistics for drift monitoring purpose.
|
|
219
|
+
|
|
220
|
+
:param sample_set_statistics: The sample set (stored end point's dataset to reference) statistics. The bins of the
|
|
221
|
+
histograms of each feature will be used to recalculate the histograms of the inputs.
|
|
222
|
+
:param inputs: The inputs to calculate their statistics and later on - the drift with respect to the
|
|
223
|
+
sample set.
|
|
224
|
+
|
|
225
|
+
:returns: The calculated statistics of the inputs data.
|
|
226
|
+
"""
|
|
227
|
+
|
|
228
|
+
# Use `DFDataInfer` to calculate the statistics over the inputs:
|
|
229
|
+
inputs_statistics = mlrun.data_types.infer.DFDataInfer.get_stats(
|
|
230
|
+
df=inputs,
|
|
231
|
+
options=mlrun.data_types.infer.InferOptions.Histogram,
|
|
232
|
+
)
|
|
233
|
+
|
|
234
|
+
# Recalculate the histograms over the bins that are set in the sample-set of the end point:
|
|
235
|
+
for feature in inputs_statistics.keys():
|
|
236
|
+
if feature in sample_set_statistics:
|
|
237
|
+
counts, bins = np.histogram(
|
|
238
|
+
inputs[feature].to_numpy(),
|
|
239
|
+
bins=sample_set_statistics[feature]["hist"][1],
|
|
240
|
+
)
|
|
241
|
+
inputs_statistics[feature]["hist"] = [
|
|
242
|
+
counts.tolist(),
|
|
243
|
+
bins.tolist(),
|
|
244
|
+
]
|
|
245
|
+
elif "hist" in inputs_statistics[feature]:
|
|
246
|
+
# Comply with the other common features' histogram length
|
|
247
|
+
mlrun.common.model_monitoring.helpers.pad_hist(
|
|
248
|
+
mlrun.common.model_monitoring.helpers.Histogram(
|
|
249
|
+
inputs_statistics[feature]["hist"]
|
|
250
|
+
)
|
|
251
|
+
)
|
|
252
|
+
|
|
253
|
+
return inputs_statistics
|
|
@@ -24,7 +24,9 @@ import mlrun
|
|
|
24
24
|
import mlrun.common.model_monitoring.helpers
|
|
25
25
|
import mlrun.config
|
|
26
26
|
import mlrun.datastore.targets
|
|
27
|
+
import mlrun.feature_store as fstore
|
|
27
28
|
import mlrun.feature_store.steps
|
|
29
|
+
import mlrun.model_monitoring.db
|
|
28
30
|
import mlrun.model_monitoring.prometheus
|
|
29
31
|
import mlrun.serving.states
|
|
30
32
|
import mlrun.utils
|
|
@@ -36,6 +38,7 @@ from mlrun.common.schemas.model_monitoring.constants import (
|
|
|
36
38
|
FileTargetKind,
|
|
37
39
|
ModelEndpointTarget,
|
|
38
40
|
ProjectSecretKeys,
|
|
41
|
+
PrometheusEndpoints,
|
|
39
42
|
)
|
|
40
43
|
from mlrun.utils import logger
|
|
41
44
|
|
|
@@ -183,11 +186,11 @@ class EventStreamProcessor:
|
|
|
183
186
|
# Step 2 - Filter out events with '-' in the path basename from going forward
|
|
184
187
|
# through the next steps of the stream graph
|
|
185
188
|
def apply_storey_filter_stream_events():
|
|
186
|
-
#
|
|
189
|
+
# Filter events with Prometheus endpoints path
|
|
187
190
|
graph.add_step(
|
|
188
191
|
"storey.Filter",
|
|
189
192
|
"filter_stream_event",
|
|
190
|
-
_fn="(
|
|
193
|
+
_fn=f"(event.path not in {PrometheusEndpoints.list()})",
|
|
191
194
|
full_event=True,
|
|
192
195
|
)
|
|
193
196
|
|
|
@@ -587,6 +590,8 @@ class ProcessBeforeParquet(mlrun.feature_store.steps.MapClass):
|
|
|
587
590
|
for key in [
|
|
588
591
|
EventFieldType.FEATURES,
|
|
589
592
|
EventFieldType.NAMED_FEATURES,
|
|
593
|
+
EventFieldType.PREDICTION,
|
|
594
|
+
EventFieldType.NAMED_PREDICTIONS,
|
|
590
595
|
]:
|
|
591
596
|
event.pop(key, None)
|
|
592
597
|
|
|
@@ -931,6 +936,8 @@ class MapFeatureNames(mlrun.feature_store.steps.MapClass):
|
|
|
931
936
|
def do(self, event: dict):
|
|
932
937
|
endpoint_id = event[EventFieldType.ENDPOINT_ID]
|
|
933
938
|
|
|
939
|
+
feature_values = event[EventFieldType.FEATURES]
|
|
940
|
+
label_values = event[EventFieldType.PREDICTION]
|
|
934
941
|
# Get feature names and label columns
|
|
935
942
|
if endpoint_id not in self.feature_names:
|
|
936
943
|
endpoint_record = get_endpoint_record(
|
|
@@ -966,6 +973,12 @@ class MapFeatureNames(mlrun.feature_store.steps.MapClass):
|
|
|
966
973
|
},
|
|
967
974
|
)
|
|
968
975
|
|
|
976
|
+
update_monitoring_feature_set(
|
|
977
|
+
endpoint_record=endpoint_record,
|
|
978
|
+
feature_names=feature_names,
|
|
979
|
+
feature_values=feature_values,
|
|
980
|
+
)
|
|
981
|
+
|
|
969
982
|
# Similar process with label columns
|
|
970
983
|
if not label_columns and self._infer_columns_from_data:
|
|
971
984
|
label_columns = self._infer_label_columns_from_data(event)
|
|
@@ -984,6 +997,11 @@ class MapFeatureNames(mlrun.feature_store.steps.MapClass):
|
|
|
984
997
|
endpoint_id=endpoint_id,
|
|
985
998
|
attributes={EventFieldType.LABEL_NAMES: json.dumps(label_columns)},
|
|
986
999
|
)
|
|
1000
|
+
update_monitoring_feature_set(
|
|
1001
|
+
endpoint_record=endpoint_record,
|
|
1002
|
+
feature_names=label_columns,
|
|
1003
|
+
feature_values=label_values,
|
|
1004
|
+
)
|
|
987
1005
|
|
|
988
1006
|
self.label_columns[endpoint_id] = label_columns
|
|
989
1007
|
self.feature_names[endpoint_id] = feature_names
|
|
@@ -1001,7 +1019,6 @@ class MapFeatureNames(mlrun.feature_store.steps.MapClass):
|
|
|
1001
1019
|
|
|
1002
1020
|
# Add feature_name:value pairs along with a mapping dictionary of all of these pairs
|
|
1003
1021
|
feature_names = self.feature_names[endpoint_id]
|
|
1004
|
-
feature_values = event[EventFieldType.FEATURES]
|
|
1005
1022
|
self._map_dictionary_values(
|
|
1006
1023
|
event=event,
|
|
1007
1024
|
named_iters=feature_names,
|
|
@@ -1011,7 +1028,6 @@ class MapFeatureNames(mlrun.feature_store.steps.MapClass):
|
|
|
1011
1028
|
|
|
1012
1029
|
# Add label_name:value pairs along with a mapping dictionary of all of these pairs
|
|
1013
1030
|
label_names = self.label_columns[endpoint_id]
|
|
1014
|
-
label_values = event[EventFieldType.PREDICTION]
|
|
1015
1031
|
self._map_dictionary_values(
|
|
1016
1032
|
event=event,
|
|
1017
1033
|
named_iters=label_names,
|
|
@@ -1137,10 +1153,10 @@ class EventRouting(mlrun.feature_store.steps.MapClass):
|
|
|
1137
1153
|
self.project: str = project
|
|
1138
1154
|
|
|
1139
1155
|
def do(self, event):
|
|
1140
|
-
if event.path ==
|
|
1156
|
+
if event.path == PrometheusEndpoints.MODEL_MONITORING_METRICS:
|
|
1141
1157
|
# Return a parsed Prometheus registry file
|
|
1142
1158
|
event.body = mlrun.model_monitoring.prometheus.get_registry()
|
|
1143
|
-
elif event.path ==
|
|
1159
|
+
elif event.path == PrometheusEndpoints.MONITORING_BATCH_METRICS:
|
|
1144
1160
|
# Update statistical metrics
|
|
1145
1161
|
for event_metric in event.body:
|
|
1146
1162
|
mlrun.model_monitoring.prometheus.write_drift_metrics(
|
|
@@ -1149,7 +1165,7 @@ class EventRouting(mlrun.feature_store.steps.MapClass):
|
|
|
1149
1165
|
metric=event_metric[EventFieldType.METRIC],
|
|
1150
1166
|
value=event_metric[EventFieldType.VALUE],
|
|
1151
1167
|
)
|
|
1152
|
-
elif event.path ==
|
|
1168
|
+
elif event.path == PrometheusEndpoints.MONITORING_DRIFT_STATUS:
|
|
1153
1169
|
# Update drift status
|
|
1154
1170
|
mlrun.model_monitoring.prometheus.write_drift_status(
|
|
1155
1171
|
project=self.project,
|
|
@@ -1209,7 +1225,7 @@ def update_endpoint_record(
|
|
|
1209
1225
|
endpoint_id: str,
|
|
1210
1226
|
attributes: dict,
|
|
1211
1227
|
):
|
|
1212
|
-
model_endpoint_store = mlrun.model_monitoring.
|
|
1228
|
+
model_endpoint_store = mlrun.model_monitoring.get_store_object(
|
|
1213
1229
|
project=project,
|
|
1214
1230
|
)
|
|
1215
1231
|
|
|
@@ -1219,7 +1235,25 @@ def update_endpoint_record(
|
|
|
1219
1235
|
|
|
1220
1236
|
|
|
1221
1237
|
def get_endpoint_record(project: str, endpoint_id: str):
|
|
1222
|
-
model_endpoint_store = mlrun.model_monitoring.
|
|
1238
|
+
model_endpoint_store = mlrun.model_monitoring.get_store_object(
|
|
1223
1239
|
project=project,
|
|
1224
1240
|
)
|
|
1225
1241
|
return model_endpoint_store.get_model_endpoint(endpoint_id=endpoint_id)
|
|
1242
|
+
|
|
1243
|
+
|
|
1244
|
+
def update_monitoring_feature_set(
|
|
1245
|
+
endpoint_record: dict[str, typing.Any],
|
|
1246
|
+
feature_names: list[str],
|
|
1247
|
+
feature_values: list[typing.Any],
|
|
1248
|
+
):
|
|
1249
|
+
monitoring_feature_set = fstore.get_feature_set(
|
|
1250
|
+
endpoint_record[
|
|
1251
|
+
mlrun.common.schemas.model_monitoring.EventFieldType.FEATURE_SET_URI
|
|
1252
|
+
]
|
|
1253
|
+
)
|
|
1254
|
+
for name, val in zip(feature_names, feature_values):
|
|
1255
|
+
monitoring_feature_set.add_feature(
|
|
1256
|
+
fstore.Feature(name=name, value_type=type(val))
|
|
1257
|
+
)
|
|
1258
|
+
|
|
1259
|
+
monitoring_feature_set.save()
|
|
@@ -11,8 +11,8 @@
|
|
|
11
11
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
|
-
#
|
|
15
14
|
|
|
15
|
+
import warnings
|
|
16
16
|
from typing import Union
|
|
17
17
|
|
|
18
18
|
import mlrun.common.schemas.schedule
|
|
@@ -55,6 +55,12 @@ class TrackingPolicy(mlrun.model.ModelObj):
|
|
|
55
55
|
writer function, which is a real time nuclio function, will be deployed
|
|
56
56
|
with the same image. By default, the image is mlrun/mlrun.
|
|
57
57
|
"""
|
|
58
|
+
warnings.warn(
|
|
59
|
+
"The `TrackingPolicy` class is deprecated from version 1.7.0 and is not "
|
|
60
|
+
"used anymore. It will be removed in 1.9.0.",
|
|
61
|
+
FutureWarning,
|
|
62
|
+
)
|
|
63
|
+
|
|
58
64
|
if isinstance(default_batch_intervals, str):
|
|
59
65
|
default_batch_intervals = (
|
|
60
66
|
mlrun.common.schemas.schedule.ScheduleCronTrigger.from_crontab(
|
mlrun/model_monitoring/writer.py
CHANGED
|
@@ -14,7 +14,6 @@
|
|
|
14
14
|
|
|
15
15
|
import datetime
|
|
16
16
|
import json
|
|
17
|
-
from http import HTTPStatus
|
|
18
17
|
from typing import Any, NewType
|
|
19
18
|
|
|
20
19
|
import pandas as pd
|
|
@@ -25,6 +24,7 @@ from v3io_frames.frames_pb2 import IGNORE
|
|
|
25
24
|
|
|
26
25
|
import mlrun.common.model_monitoring
|
|
27
26
|
import mlrun.model_monitoring
|
|
27
|
+
import mlrun.model_monitoring.db.stores
|
|
28
28
|
import mlrun.utils.v3io_clients
|
|
29
29
|
from mlrun.common.schemas.model_monitoring.constants import ResultStatusApp, WriterEvent
|
|
30
30
|
from mlrun.common.schemas.notification import NotificationKind, NotificationSeverity
|
|
@@ -106,13 +106,11 @@ class ModelMonitoringWriter(StepToDict):
|
|
|
106
106
|
self.project = project
|
|
107
107
|
self.name = project # required for the deployment process
|
|
108
108
|
self._v3io_container = self.get_v3io_container(self.name)
|
|
109
|
-
self._kv_client = self._get_v3io_client().kv
|
|
110
109
|
self._tsdb_client = self._get_v3io_frames_client(self._v3io_container)
|
|
111
110
|
self._custom_notifier = CustomNotificationPusher(
|
|
112
111
|
notification_types=[NotificationKind.slack]
|
|
113
112
|
)
|
|
114
113
|
self._create_tsdb_table()
|
|
115
|
-
self._kv_schemas = []
|
|
116
114
|
|
|
117
115
|
@staticmethod
|
|
118
116
|
def get_v3io_container(project_name: str) -> str:
|
|
@@ -141,40 +139,10 @@ class ModelMonitoringWriter(StepToDict):
|
|
|
141
139
|
|
|
142
140
|
def _update_kv_db(self, event: _AppResultEvent) -> None:
|
|
143
141
|
event = _AppResultEvent(event.copy())
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
metric_name = event.pop(WriterEvent.RESULT_NAME)
|
|
147
|
-
attributes = {metric_name: json.dumps(event)}
|
|
148
|
-
self._kv_client.update(
|
|
149
|
-
container=self._v3io_container,
|
|
150
|
-
table_path=endpoint_id,
|
|
151
|
-
key=app_name,
|
|
152
|
-
attributes=attributes,
|
|
142
|
+
application_result_store = mlrun.model_monitoring.get_store_object(
|
|
143
|
+
project=self.project
|
|
153
144
|
)
|
|
154
|
-
|
|
155
|
-
self._generate_kv_schema(endpoint_id)
|
|
156
|
-
logger.info("Updated V3IO KV successfully", key=app_name)
|
|
157
|
-
|
|
158
|
-
def _generate_kv_schema(self, endpoint_id: str):
|
|
159
|
-
"""Generate V3IO KV schema file which will be used by the model monitoring applications dashboard in Grafana."""
|
|
160
|
-
fields = [
|
|
161
|
-
{"name": WriterEvent.RESULT_NAME, "type": "string", "nullable": False}
|
|
162
|
-
]
|
|
163
|
-
res = self._kv_client.create_schema(
|
|
164
|
-
container=self._v3io_container,
|
|
165
|
-
table_path=endpoint_id,
|
|
166
|
-
key=WriterEvent.APPLICATION_NAME,
|
|
167
|
-
fields=fields,
|
|
168
|
-
)
|
|
169
|
-
if res.status_code != HTTPStatus.OK.value:
|
|
170
|
-
raise mlrun.errors.MLRunBadRequestError(
|
|
171
|
-
f"Couldn't infer schema for endpoint {endpoint_id} which is required for Grafana dashboards"
|
|
172
|
-
)
|
|
173
|
-
else:
|
|
174
|
-
logger.info(
|
|
175
|
-
"Generated V3IO KV schema successfully", endpoint_id=endpoint_id
|
|
176
|
-
)
|
|
177
|
-
self._kv_schemas.append(endpoint_id)
|
|
145
|
+
application_result_store.write_application_result(event=event)
|
|
178
146
|
|
|
179
147
|
def _update_tsdb(self, event: _AppResultEvent) -> None:
|
|
180
148
|
event = _AppResultEvent(event.copy())
|
mlrun/projects/pipelines.py
CHANGED
|
@@ -412,6 +412,11 @@ def enrich_function_object(
|
|
|
412
412
|
if decorator:
|
|
413
413
|
decorator(f)
|
|
414
414
|
|
|
415
|
+
if project.spec.default_function_node_selector:
|
|
416
|
+
f.enrich_runtime_spec(
|
|
417
|
+
project.spec.default_function_node_selector,
|
|
418
|
+
)
|
|
419
|
+
|
|
415
420
|
if try_auto_mount:
|
|
416
421
|
if (
|
|
417
422
|
decorator and AutoMountType.is_auto_modifier(decorator)
|
|
@@ -608,6 +613,7 @@ class _KFPRunner(_PipelineRunner):
|
|
|
608
613
|
namespace=namespace,
|
|
609
614
|
artifact_path=artifact_path,
|
|
610
615
|
cleanup_ttl=workflow_spec.cleanup_ttl,
|
|
616
|
+
timeout=int(mlrun.mlconf.workflows.timeouts.kfp),
|
|
611
617
|
)
|
|
612
618
|
|
|
613
619
|
# The user provided workflow code might have made changes to function specs that require cleanup
|
|
@@ -865,15 +871,21 @@ class _RemoteRunner(_PipelineRunner):
|
|
|
865
871
|
)
|
|
866
872
|
return
|
|
867
873
|
|
|
874
|
+
get_workflow_id_timeout = max(
|
|
875
|
+
int(mlrun.mlconf.workflows.timeouts.remote),
|
|
876
|
+
int(getattr(mlrun.mlconf.workflows.timeouts, inner_engine.engine)),
|
|
877
|
+
)
|
|
878
|
+
|
|
868
879
|
logger.debug(
|
|
869
880
|
"Workflow submitted, waiting for pipeline run to start",
|
|
870
881
|
workflow_name=workflow_response.name,
|
|
882
|
+
get_workflow_id_timeout=get_workflow_id_timeout,
|
|
871
883
|
)
|
|
872
884
|
|
|
873
885
|
# Getting workflow id from run:
|
|
874
886
|
response = retry_until_successful(
|
|
875
887
|
1,
|
|
876
|
-
|
|
888
|
+
get_workflow_id_timeout,
|
|
877
889
|
logger,
|
|
878
890
|
False,
|
|
879
891
|
run_db.get_workflow_id,
|