mlrun 1.8.0rc30__py3-none-any.whl → 1.8.0rc31__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of mlrun has been flagged as potentially problematic.
- mlrun/__init__.py +2 -35
- mlrun/api/schemas/__init__.py +1 -6
- mlrun/common/runtimes/constants.py +4 -0
- mlrun/common/schemas/__init__.py +0 -2
- mlrun/common/schemas/model_monitoring/__init__.py +0 -2
- mlrun/common/schemas/model_monitoring/constants.py +1 -6
- mlrun/common/schemas/model_monitoring/grafana.py +17 -11
- mlrun/config.py +9 -36
- mlrun/datastore/storeytargets.py +20 -3
- mlrun/model_monitoring/applications/base.py +55 -40
- mlrun/model_monitoring/applications/results.py +2 -2
- mlrun/model_monitoring/controller.py +4 -3
- mlrun/model_monitoring/db/tsdb/__init__.py +9 -5
- mlrun/model_monitoring/db/tsdb/base.py +60 -39
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +117 -52
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +140 -14
- mlrun/model_monitoring/helpers.py +16 -15
- mlrun/model_monitoring/stream_processing.py +6 -13
- mlrun/projects/pipelines.py +11 -3
- mlrun/projects/project.py +84 -107
- mlrun/serving/states.py +1 -1
- mlrun/serving/v2_serving.py +20 -10
- mlrun/utils/helpers.py +1 -1
- mlrun/utils/logger.py +13 -10
- mlrun/utils/notifications/notification_pusher.py +24 -0
- mlrun/utils/regex.py +1 -0
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.8.0rc30.dist-info → mlrun-1.8.0rc31.dist-info}/METADATA +2 -2
- {mlrun-1.8.0rc30.dist-info → mlrun-1.8.0rc31.dist-info}/RECORD +33 -33
- {mlrun-1.8.0rc30.dist-info → mlrun-1.8.0rc31.dist-info}/LICENSE +0 -0
- {mlrun-1.8.0rc30.dist-info → mlrun-1.8.0rc31.dist-info}/WHEEL +0 -0
- {mlrun-1.8.0rc30.dist-info → mlrun-1.8.0rc31.dist-info}/entry_points.txt +0 -0
- {mlrun-1.8.0rc30.dist-info → mlrun-1.8.0rc31.dist-info}/top_level.txt +0 -0

mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py
CHANGED

@@ -11,10 +11,11 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
+import asyncio
+import math
 from datetime import datetime, timedelta, timezone
 from io import StringIO
-from typing import Literal, Optional, Union
+from typing import Callable, Literal, Optional, Union

 import pandas as pd
 import v3io_frames
@@ -491,8 +492,9 @@ class V3IOTSDBConnector(TSDBConnector):
         interval: Optional[str] = None,
         agg_funcs: Optional[list[str]] = None,
         sliding_window_step: Optional[str] = None,
+        get_raw: bool = False,
         **kwargs,
-    ) -> pd.DataFrame:
+    ) -> Union[pd.DataFrame, list[v3io_frames.client.RawFrame]]:
         """
         Getting records from V3IO TSDB data collection.
         :param table: Path to the collection to query.
@@ -517,6 +519,10 @@ class V3IOTSDBConnector(TSDBConnector):
             `sliding_window_step` is provided, interval must be provided as well. Provided
             as a string in the format of '1m', '1h', etc.
         :param kwargs: Additional keyword arguments passed to the read method of frames client.
+        :param get_raw: Whether to return the request as raw frames rather than a pandas dataframe.
+            Defaults to False. This can greatly improve performance when a dataframe isn't
+            needed.
+
         :return: DataFrame with the provided attributes from the data collection.
         :raise: MLRunNotFoundError if the provided table wasn't found.
         """
@@ -530,7 +536,7 @@ class V3IOTSDBConnector(TSDBConnector):
         aggregators = ",".join(agg_funcs) if agg_funcs else None
         table_path = self.tables[table]
         try:
-            df = self.frames_client.read(
+            res = self.frames_client.read(
                 backend=_TSDB_BE,
                 table=table_path,
                 start=start,
@@ -540,15 +546,18 @@ class V3IOTSDBConnector(TSDBConnector):
                 aggregation_window=interval,
                 aggregators=aggregators,
                 step=sliding_window_step,
+                get_raw=get_raw,
                 **kwargs,
             )
+            if get_raw:
+                res = list(res)
         except v3io_frames.Error as err:
             if _is_no_schema_error(err):
-                return pd.DataFrame()
+                return [] if get_raw else pd.DataFrame()
             else:
                 raise err

-        return df
+        return res

    def _get_v3io_source_directory(self) -> str:
        """
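
As a rough illustration of the new return contract (not taken from the package itself; the connector instance and the `start`/`end` datetimes below are assumed to already exist):

    # Sketch only: `connector` is an initialized V3IOTSDBConnector.
    df = connector._get_records(
        table=mm_schemas.V3IOTSDBTables.PREDICTIONS, start=start, end=end
    )
    # default: a pandas DataFrame (an empty DataFrame if the table has no schema yet)

    frames = connector._get_records(
        table=mm_schemas.V3IOTSDBTables.PREDICTIONS, start=start, end=end, get_raw=True
    )
    # get_raw=True: a list of v3io_frames RawFrame objects ([] if the table has no schema yet),
    # skipping DataFrame construction entirely
    for frame in frames:
        endpoint_ids = frame.column_data("endpoint_id")
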
@@ -778,16 +787,23 @@ class V3IOTSDBConnector(TSDBConnector):
         endpoint_ids: Union[str, list[str]],
         start: Optional[datetime] = None,
         end: Optional[datetime] = None,
-    ) -> pd.DataFrame:
+        get_raw: bool = False,
+    ) -> Union[pd.DataFrame, list[v3io_frames.client.RawFrame]]:
         filter_query = self._get_endpoint_filter(endpoint_id=endpoint_ids)
         start, end = self._get_start_end(start, end)
-        df = self._get_records(
+        res = self._get_records(
             table=mm_schemas.V3IOTSDBTables.PREDICTIONS,
             start=start,
             end=end,
             filter_query=filter_query,
             agg_funcs=["last"],
+            get_raw=get_raw,
         )
+
+        if get_raw:
+            return res
+
+        df = res
         if not df.empty:
             df.rename(
                 columns={
@@ -811,11 +827,12 @@ class V3IOTSDBConnector(TSDBConnector):
         endpoint_ids: Union[str, list[str]],
         start: Optional[datetime] = None,
         end: Optional[datetime] = None,
-    ) -> pd.DataFrame:
+        get_raw: bool = False,
+    ) -> Union[pd.DataFrame, list[v3io_frames.client.RawFrame]]:
         filter_query = self._get_endpoint_filter(endpoint_id=endpoint_ids)
         start = start or (mlrun.utils.datetime_now() - timedelta(hours=24))
         start, end = self._get_start_end(start, end)
-        df = self._get_records(
+        res = self._get_records(
             table=mm_schemas.V3IOTSDBTables.APP_RESULTS,
             start=start,
             end=end,
@@ -823,7 +840,12 @@ class V3IOTSDBConnector(TSDBConnector):
             filter_query=filter_query,
             agg_funcs=["max"],
             group_by="endpoint_id",
+            get_raw=get_raw,
         )
+        if get_raw:
+            return res
+
+        df = res
         if not df.empty:
             df.columns = [
                 col[len("max(") : -1] if "max(" in col else col for col in df.columns
@@ -884,21 +906,28 @@ class V3IOTSDBConnector(TSDBConnector):
         endpoint_ids: Union[str, list[str]],
         start: Optional[datetime] = None,
         end: Optional[datetime] = None,
-    ) -> pd.DataFrame:
+        get_raw: bool = False,
+    ) -> Union[pd.DataFrame, list[v3io_frames.client.RawFrame]]:
         filter_query = self._get_endpoint_filter(endpoint_id=endpoint_ids)
         if filter_query:
             filter_query += f"AND {mm_schemas.EventFieldType.ERROR_TYPE} == '{mm_schemas.EventFieldType.INFER_ERROR}'"
         else:
             filter_query = f"{mm_schemas.EventFieldType.ERROR_TYPE} == '{mm_schemas.EventFieldType.INFER_ERROR}' z"
         start, end = self._get_start_end(start, end)
-        df = self._get_records(
+        res = self._get_records(
             table=mm_schemas.FileTargetKind.ERRORS,
             start=start,
             end=end,
             columns=[mm_schemas.EventFieldType.ERROR_COUNT],
             filter_query=filter_query,
             agg_funcs=["count"],
+            get_raw=get_raw,
         )
+
+        if get_raw:
+            return res
+
+        df = res
         if not df.empty:
             df.rename(
                 columns={
@@ -914,18 +943,25 @@ class V3IOTSDBConnector(TSDBConnector):
         endpoint_ids: Union[str, list[str]],
         start: Optional[datetime] = None,
         end: Optional[datetime] = None,
-    ) -> pd.DataFrame:
+        get_raw: bool = False,
+    ) -> Union[pd.DataFrame, list[v3io_frames.client.RawFrame]]:
         filter_query = self._get_endpoint_filter(endpoint_id=endpoint_ids)
         start = start or (mlrun.utils.datetime_now() - timedelta(hours=24))
         start, end = self._get_start_end(start, end)
-        df = self._get_records(
+        res = self._get_records(
             table=mm_schemas.V3IOTSDBTables.PREDICTIONS,
             start=start,
             end=end,
             columns=[mm_schemas.EventFieldType.LATENCY],
             filter_query=filter_query,
             agg_funcs=["avg"],
+            get_raw=get_raw,
         )
+
+        if get_raw:
+            return res
+
+        df = res
         if not df.empty:
             df.dropna(inplace=True)
             df.rename(
@@ -935,3 +971,93 @@ class V3IOTSDBConnector(TSDBConnector):
             inplace=True,
         )
         return df.reset_index(drop=True)
+
+    async def add_basic_metrics(
+        self,
+        model_endpoint_objects: list[mlrun.common.schemas.ModelEndpoint],
+        project: str,
+        run_in_threadpool: Callable,
+    ) -> list[mlrun.common.schemas.ModelEndpoint]:
+        """
+        Fetch basic metrics from V3IO TSDB and add them to MEP objects.
+
+        :param model_endpoint_objects: A list of `ModelEndpoint` objects that will
+                                       be filled with the relevant basic metrics.
+        :param project:                The name of the project.
+        :param run_in_threadpool:      A function that runs another function in a thread pool.
+
+        :return: A list of `ModelEndpointMonitoringMetric` objects.
+        """
+
+        uids = []
+        model_endpoint_objects_by_uid = {}
+        for model_endpoint_object in model_endpoint_objects:
+            uid = model_endpoint_object.metadata.uid
+            uids.append(uid)
+            model_endpoint_objects_by_uid[uid] = model_endpoint_object
+
+        coroutines = [
+            run_in_threadpool(
+                self.get_error_count,
+                endpoint_ids=uids,
+                get_raw=True,
+            ),
+            run_in_threadpool(
+                self.get_last_request,
+                endpoint_ids=uids,
+                get_raw=True,
+            ),
+            run_in_threadpool(
+                self.get_avg_latency,
+                endpoint_ids=uids,
+                get_raw=True,
+            ),
+            run_in_threadpool(
+                self.get_drift_status,
+                endpoint_ids=uids,
+                get_raw=True,
+            ),
+        ]
+
+        (
+            error_count_res,
+            last_request_res,
+            avg_latency_res,
+            drift_status_res,
+        ) = await asyncio.gather(*coroutines)
+
+        def add_metric(
+            metric: str,
+            column_name: str,
+            frames: list,
+        ):
+            for frame in frames:
+                endpoint_ids = frame.column_data("endpoint_id")
+                metric_data = frame.column_data(column_name)
+                for index, endpoint_id in enumerate(endpoint_ids):
+                    mep = model_endpoint_objects_by_uid.get(endpoint_id)
+                    value = metric_data[index]
+                    if mep and value is not None and not math.isnan(value):
+                        setattr(mep.status, metric, value)
+
+        add_metric(
+            "error_count",
+            "count(error_count)",
+            error_count_res,
+        )
+        add_metric(
+            "last_request",
+            "last(last_request_timestamp)",
+            last_request_res,
+        )
+        add_metric(
+            "avg_latency",
+            "max(result_status)",
+            drift_status_res,
+        )
+        add_metric(
+            "result_status",
+            "avg(latency)",
+            avg_latency_res,
+        )
+        return list(model_endpoint_objects_by_uid.values())
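
The `run_in_threadpool` argument is injected by the caller; a minimal sketch of one way to satisfy it with the standard library (the wrapper below is illustrative and not part of the diff):

    import asyncio
    import functools

    async def run_in_threadpool(func, *args, **kwargs):
        # Run a blocking callable in the default executor so the event loop is not blocked.
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, functools.partial(func, *args, **kwargs))

    # enriched = await connector.add_basic_metrics(
    #     model_endpoint_objects=endpoints,
    #     project="my-project",
    #     run_in_threadpool=run_in_threadpool,
    # )
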
mlrun/model_monitoring/helpers.py
CHANGED

@@ -246,21 +246,6 @@ def get_monitoring_drift_measures_data(project: str, endpoint_id: str) -> "DataI
     )


-def get_tsdb_connection_string(
-    secret_provider: Optional[Callable[[str], str]] = None,
-) -> str:
-    """Get TSDB connection string from the project secret. If wasn't set, take it from the system
-    configurations.
-    :param secret_provider: An optional secret provider to get the connection string secret.
-    :return: Valid TSDB connection string.
-    """
-
-    return mlrun.get_secret_or_env(
-        key=mm_constants.ProjectSecretKeys.TSDB_CONNECTION,
-        secret_provider=secret_provider,
-    )
-
-
 def _get_profile(
     project: str,
     secret_provider: Optional[Callable[[str], str]],
@@ -554,6 +539,22 @@ def get_result_instance_fqn(
     return f"{model_endpoint_id}.{app_name}.result.{result_name}"


+def get_alert_name_from_result_fqn(result_fqn: str):
+    """
+    :param result_fqn: current get_result_instance_fqn format: `{model_endpoint_id}.{app_name}.result.{result_name}`
+
+    :return: shorter fqn without forbidden alert characters.
+    """
+    if result_fqn.count(".") != 3 or result_fqn.split(".")[2] != "result":
+        raise mlrun.errors.MLRunValueError(
+            f"result_fqn: {result_fqn} is not in the correct format: {{model_endpoint_id}}.{{app_name}}."
+            f"result.{{result_name}}"
+        )
+    # Name format cannot contain "."
+    # The third component is always `result`, so it is not necessary for checking uniqueness.
+    return "_".join(result_fqn.split(".")[i] for i in [0, 1, 3])
+
+
 def get_default_result_instance_fqn(model_endpoint_id: str) -> str:
     return get_result_instance_fqn(
         model_endpoint_id,
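
For illustration (the endpoint id and application name below are hypothetical), the new helper drops the constant `result` component and joins the remaining parts with underscores, since alert names cannot contain ".":

    get_alert_name_from_result_fqn("a1b2c3.histogram-data-drift.result.general_drift")
    # -> "a1b2c3_histogram-data-drift_general_drift"
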
mlrun/model_monitoring/stream_processing.py
CHANGED

@@ -13,7 +13,6 @@
 # limitations under the License.

 import datetime
-import os
 import typing

 import storey
@@ -65,14 +64,11 @@ class EventStreamProcessor:
             parquet_batching_max_events=self.parquet_batching_max_events,
         )

-        self.storage_options = None
         self.tsdb_configurations = {}
         if not mlrun.mlconf.is_ce_mode():
             self._initialize_v3io_configurations(
                 model_monitoring_access_key=model_monitoring_access_key
             )
-        elif self.parquet_path.startswith("s3://"):
-            self.storage_options = mlrun.mlconf.get_s3_storage_options()

     def _initialize_v3io_configurations(
         self,
@@ -87,17 +83,12 @@ class EventStreamProcessor:
         self.v3io_framesd = v3io_framesd or mlrun.mlconf.v3io_framesd
         self.v3io_api = v3io_api or mlrun.mlconf.v3io_api

-        self.v3io_access_key = v3io_access_key or mlrun.
-            "V3IO_ACCESS_KEY"
-        )
+        self.v3io_access_key = v3io_access_key or mlrun.mlconf.get_v3io_access_key()
         self.model_monitoring_access_key = (
             model_monitoring_access_key
-            or
+            or mlrun.get_secret_or_env(ProjectSecretKeys.ACCESS_KEY)
             or self.v3io_access_key
         )
-        self.storage_options = dict(
-            v3io_access_key=self.model_monitoring_access_key, v3io_api=self.v3io_api
-        )

         # TSDB path and configurations
         tsdb_path = mlrun.mlconf.get_model_monitoring_file_target_path(
@@ -248,12 +239,12 @@ class EventStreamProcessor:
         # Write the Parquet target file, partitioned by key (endpoint_id) and time.
         def apply_parquet_target():
             graph.add_step(
-                "
+                "mlrun.datastore.storeytargets.ParquetStoreyTarget",
+                alternative_v3io_access_key=mlrun.common.schemas.model_monitoring.ProjectSecretKeys.ACCESS_KEY,
                 name="ParquetTarget",
                 after="ProcessBeforeParquet",
                 graph_shape="cylinder",
                 path=self.parquet_path,
-                storage_options=self.storage_options,
                 max_events=self.parquet_batching_max_events,
                 flush_after_seconds=self.parquet_batching_timeout_secs,
                 attributes={"infer_columns_from_data": True},
@@ -794,6 +785,8 @@ class MapFeatureNames(mlrun.feature_store.steps.MapClass):

         """
         event[mapping_dictionary] = {}
+        diff = len(named_iters) - len(values_iters)
+        values_iters += [None] * diff
         for name, value in zip(named_iters, values_iters):
             event[name] = value
             event[mapping_dictionary][name] = value
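
The `MapFeatureNames` change pads `values_iters` with `None` so that `zip` does not silently drop trailing feature names; a standalone sketch of the same idea with made-up values:

    named_iters = ["f1", "f2", "f3"]
    values_iters = [0.1, 0.2]
    values_iters += [None] * (len(named_iters) - len(values_iters))
    print(list(zip(named_iters, values_iters)))  # [('f1', 0.1), ('f2', 0.2), ('f3', None)]
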
mlrun/projects/pipelines.py
CHANGED

@@ -615,13 +615,21 @@ class _KFPRunner(_PipelineRunner):
                 "Notifications will only be sent if you wait for pipeline completion. "
                 "Some of the features (like setting message or severity level) are not supported."
             )
-            # for start message, fallback to old notification behavior
             for notification in notifications or []:
                 params = notification.params
                 params.update(notification.secret_params)
-                project.notifiers.add_notification(
+                project.notifiers.add_notification(
+                    notification_type=notification.kind,
+                    params=params,
+                    name=notification.name,
+                    message=notification.message,
+                    severity=notification.severity,
+                    when=notification.when,
+                    condition=notification.condition,
+                    secret_params=notification.secret_params,
+                )

-            project.spec.notifications =
+            project.spec.notifications = project.notifiers.server_notifications

         run_id = _run_pipeline(
             workflow_handler,
mlrun/projects/project.py
CHANGED

@@ -29,7 +29,6 @@ import zipfile
 from copy import deepcopy
 from os import environ, makedirs, path
 from typing import Callable, Optional, Union, cast
-from urllib.parse import urlparse

 import deprecated
 import dotenv
@@ -71,6 +70,7 @@ from mlrun.datastore.datastore_profile import (
 from mlrun.datastore.vectorstore import VectorStoreCollection
 from mlrun.model_monitoring.helpers import (
     filter_results_by_regex,
+    get_alert_name_from_result_fqn,
     get_result_instance_fqn,
 )
 from mlrun.runtimes.nuclio.function import RemoteRuntime
@@ -2142,7 +2142,8 @@ class MlrunProject(ModelObj):
         reset_policy: mlrun.common.schemas.alert.ResetPolicy = mlrun.common.schemas.alert.ResetPolicy.AUTO,
     ) -> list[mlrun.alerts.alert.AlertConfig]:
         """
-        :param name: AlertConfig name
+        :param name: The name of the AlertConfig template. It will be combined with mep_id, app-name
+                     and result name to generate a unique name.
         :param summary: Summary of the alert, will be sent in the generated notifications
         :param endpoints: The endpoints from which metrics will be retrieved to configure the alerts.
                           This `ModelEndpointList` object obtained via the `list_model_endpoints`
@@ -2203,10 +2204,11 @@ class MlrunProject(ModelObj):
         )
         alert_result_names = list(set(specific_result_names + matching_results))
         for result_fqn in alert_result_names:
+            result_fqn_name = get_alert_name_from_result_fqn(result_fqn)
             alerts.append(
                 mlrun.alerts.alert.AlertConfig(
                     project=self.name,
-                    name=name,
+                    name=f"{name}--{result_fqn_name}",
                     summary=summary,
                     severity=severity,
                     entities=alert_constants.EventEntities(
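
With this change, each generated alert gets a unique name derived from the template name and the result FQN; a quick illustration with hypothetical values:

    # name="drift-alert", result_fqn="a1b2c3.histogram-data-drift.result.general_drift"
    result_fqn_name = get_alert_name_from_result_fqn(result_fqn)
    # -> "a1b2c3_histogram-data-drift_general_drift"
    alert_name = f"{name}--{result_fqn_name}"
    # -> "drift-alert--a1b2c3_histogram-data-drift_general_drift"
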
@@ -3671,50 +3673,77 @@ class MlrunProject(ModelObj):

     def set_model_monitoring_credentials(
         self,
-        access_key: Optional[str] = None,
-        stream_path: Optional[str] = None,  # Deprecated
-        tsdb_connection: Optional[str] = None,  # Deprecated
-        replace_creds: bool = False,
         *,
-        tsdb_profile_name: Optional[str] = None,
-        stream_profile_name: Optional[str] = None,
-    ) -> None:
+        tsdb_profile_name: str,
+        stream_profile_name: str,
+        replace_creds: bool = False,
+    ) -> None:
         """
-        Set the credentials that will be used by the project's model monitoring
-
-
-
-        :param access_key: Model monitoring access key for managing user permissions.
-
-            * None - will be set from the system configuration.
-            * v3io - for v3io endpoint store, pass `v3io` and the system will generate the
-              exact path.
-        :param stream_path: (Deprecated) This argument is deprecated. Use ``stream_profile_name`` instead.
-            Path to the model monitoring stream. By default, None. Options:
-
-            * ``"v3io"`` - for v3io stream, pass ``"v3io"`` and the system will generate
-              the exact path.
-            * Kafka - for Kafka stream, provide the full connection string without acustom
-              topic, for example ``"kafka://<some_kafka_broker>:<port>"``.
-        :param tsdb_connection: (Deprecated) Connection string to the time series database. By default, None.
-            Options:
-
-            * v3io - for v3io stream, pass ``"v3io"`` and the system will generate the
-              exact path.
-            * TDEngine - for TDEngine tsdb, provide the full websocket connection URL,
-              for example ``"taosws://<username>:<password>@<host>:<port>"``.
-        :param replace_creds: If True, will override the existing credentials.
-            Please keep in mind that if you already enabled model monitoring on
-            your project this action can cause data loose and will require redeploying
-            all model monitoring functions & model monitoring infra
-            & tracked model server.
-        :param stream_profile_name: The datastore profile name of the stream to be used in model monitoring.
-            The supported profiles are:
+        Set the credentials that will be used by the project's model monitoring infrastructure functions.
+        Please note that you have to set the credentials before deploying any model monitoring application
+        or a tracked serving function.

-
-
+        For example, the full flow for enabling model monitoring infrastructure with **TDEngine** and **Kafka**, is:
+
+        .. code-block:: python
+
+            import mlrun
+            from mlrun.datastore.datastore_profile import (
+                DatastoreProfileKafkaSource,
+                TDEngineDatastoreProfile,
+            )
+
+            project = mlrun.get_or_create_project("mm-infra-setup")
+
+            # Create and register TSDB profile
+            tsdb_profile = TDEngineDatastoreProfile(
+                name="my-tdengine",
+                host="<tdengine-server-ip-address>",
+                port=6041,
+                user="username",
+                password="<tdengine-password>",
+            )
+            project.register_datastore_profile(tsdb_profile)
+
+            # Create and register stream profile
+            stream_profile = DatastoreProfileKafkaSource(
+                name="my-kafka",
+                brokers=["<kafka-broker-ip-address>:9094"],
+                topics=[],  # Keep the topics list empty
+                ## SASL is supported
+                # sasl_user="user1",
+                # sasl_pass="<kafka-sasl-password>",
+            )
+            project.register_datastore_profile(stream_profile)
+
+            # Set model monitoring credentials and enable the infrastructure
+            project.set_model_monitoring_credentials(
+                tsdb_profile_name=tsdb_profile.name,
+                stream_profile_name=stream_profile.name,
+            )
+            project.enable_model_monitoring()
+
+        Note that you will need to change the profiles if you want to use **V3IO** TSDB and stream:
+
+        .. code-block:: python
+
+            from mlrun.datastore.datastore_profile import DatastoreProfileV3io
+
+            # Create and register TSDB profile
+            tsdb_profile = DatastoreProfileV3io(
+                name="my-v3io-tsdb",
+            )
+            project.register_datastore_profile(tsdb_profile)
+
+            # Create and register stream profile
+            stream_profile = DatastoreProfileV3io(
+                name="my-v3io-stream",
+                v3io_access_key=mlrun.mlconf.get_v3io_access_key(),
+            )
+            project.register_datastore_profile(stream_profile)
+
+        In the V3IO datastore, you must provide an explicit access key to the stream, but not to the TSDB.

-        You need to register one of them, and pass the profile's name.
         :param tsdb_profile_name: The datastore profile name of the time-series database to be used in model
                                   monitoring. The supported profiles are:

@@ -3722,76 +3751,24 @@ class MlrunProject(ModelObj):
         * :py:class:`~mlrun.datastore.datastore_profile.TDEngineDatastoreProfile`

         You need to register one of them, and pass the profile's name.
-
-
-
-        if tsdb_connection:
-            warnings.warn(
-                "The `tsdb_connection` argument is deprecated and will be removed in MLRun version 1.8.0. "
-                "Use `tsdb_profile_name` instead.",
-                FutureWarning,
-            )
-            if tsdb_profile_name:
-                raise mlrun.errors.MLRunValueError(
-                    "If you set `tsdb_profile_name`, you must not pass `tsdb_connection`."
-                )
-            if tsdb_connection == "v3io":
-                tsdb_profile = mlrun.datastore.datastore_profile.DatastoreProfileV3io(
-                    name=mm_constants.DefaultProfileName.TSDB
-                )
-            else:
-                parsed_url = urlparse(tsdb_connection)
-                if parsed_url.scheme != "taosws":
-                    raise mlrun.errors.MLRunValueError(
-                        f"Unsupported `tsdb_connection`: '{tsdb_connection}'."
-                    )
-                tsdb_profile = (
-                    mlrun.datastore.datastore_profile.TDEngineDatastoreProfile(
-                        name=mm_constants.DefaultProfileName.TSDB,
-                        user=parsed_url.username,
-                        password=parsed_url.password,
-                        host=parsed_url.hostname,
-                        port=parsed_url.port,
-                    )
-                )
+        :param stream_profile_name: The datastore profile name of the stream to be used in model monitoring.
+                                    The supported profiles are:

-
-
+        * :py:class:`~mlrun.datastore.datastore_profile.DatastoreProfileV3io`
+        * :py:class:`~mlrun.datastore.datastore_profile.DatastoreProfileKafkaSource`

-
-
-
-
-
-
-
-
-                    "If you set `stream_profile_name`, you must not pass `stream_path`."
-                )
-            if stream_path == "v3io":
-                stream_profile = mlrun.datastore.datastore_profile.DatastoreProfileV3io(
-                    name=mm_constants.DefaultProfileName.STREAM
-                )
-            else:
-                parsed_stream = urlparse(stream_path)
-                if parsed_stream.scheme != "kafka":
-                    raise mlrun.errors.MLRunValueError(
-                        f"Unsupported `stream_path`: '{stream_path}'."
-                    )
-                stream_profile = (
-                    mlrun.datastore.datastore_profile.DatastoreProfileKafkaSource(
-                        name=mm_constants.DefaultProfileName.STREAM,
-                        brokers=[parsed_stream.netloc],
-                        topics=[],
-                    )
-                )
-            self.register_datastore_profile(stream_profile)
-            stream_profile_name = stream_profile.name
+        You need to register one of them, and pass the profile's name.
+        :param replace_creds: If ``True`` - override the existing credentials.
+                              Please keep in mind that if you have already enabled model monitoring
+                              on your project, replacing the credentials can cause data loss, and will
+                              require redeploying all the model monitoring functions, model monitoring
+                              infrastructure, and tracked model servers.
+        """
+        db = mlrun.db.get_run_db(secrets=self._secrets)

         db.set_model_monitoring_credentials(
             project=self.name,
             credentials={
-                "access_key": access_key,
                 "tsdb_profile_name": tsdb_profile_name,
                 "stream_profile_name": stream_profile_name,
             },
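
For callers that previously relied on the removed `tsdb_connection`/`stream_path` keywords, a rough migration sketch that mirrors what the deleted code path used to do automatically (the profile names, connection string, and the existing `project` object are illustrative assumptions):

    from urllib.parse import urlparse

    from mlrun.datastore.datastore_profile import TDEngineDatastoreProfile

    # Before: project.set_model_monitoring_credentials(tsdb_connection="taosws://user:pass@host:6041")
    parsed = urlparse("taosws://user:pass@host:6041")
    tsdb_profile = TDEngineDatastoreProfile(
        name="my-tdengine",
        user=parsed.username,
        password=parsed.password,
        host=parsed.hostname,
        port=parsed.port,
    )
    project.register_datastore_profile(tsdb_profile)
    project.set_model_monitoring_credentials(
        tsdb_profile_name=tsdb_profile.name,
        stream_profile_name="my-kafka",  # a registered stream profile is required as well
    )
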
mlrun/serving/states.py
CHANGED

@@ -812,8 +812,8 @@ class RouterStep(TaskStep):
            * **archive**:
               1. If model endpoints with the same name exist, preserve them.
               2. Create a new model endpoint with the same name and set it to `latest`.
-
        """
+
        if len(self.routes.keys()) >= MAX_MODELS_PER_ROUTER and key not in self.routes:
            raise mlrun.errors.MLRunModelLimitExceededError(
                f"Router cannot support more than {MAX_MODELS_PER_ROUTER} model endpoints. "