mlrun 1.8.0rc18__py3-none-any.whl → 1.8.0rc20__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mlrun might be problematic. Click here for more details.
- mlrun/__main__.py +5 -0
- mlrun/common/runtimes/constants.py +17 -0
- mlrun/common/schemas/artifact.py +6 -0
- mlrun/common/schemas/model_monitoring/__init__.py +1 -0
- mlrun/common/schemas/model_monitoring/constants.py +16 -0
- mlrun/common/schemas/model_monitoring/model_endpoints.py +4 -2
- mlrun/config.py +2 -2
- mlrun/db/base.py +18 -0
- mlrun/db/httpdb.py +118 -1
- mlrun/db/nopdb.py +9 -0
- mlrun/frameworks/_common/model_handler.py +0 -2
- mlrun/model_monitoring/db/tsdb/base.py +116 -8
- mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +2 -0
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +37 -29
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +46 -26
- mlrun/model_monitoring/helpers.py +2 -2
- mlrun/model_monitoring/stream_processing.py +21 -0
- mlrun/projects/pipelines.py +16 -3
- mlrun/projects/project.py +45 -8
- mlrun/runtimes/nuclio/serving.py +20 -11
- mlrun/serving/v2_serving.py +51 -36
- mlrun/utils/helpers.py +163 -1
- mlrun/utils/notifications/notification/webhook.py +3 -0
- mlrun/utils/notifications/notification_pusher.py +59 -165
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.8.0rc18.dist-info → mlrun-1.8.0rc20.dist-info}/METADATA +1 -1
- {mlrun-1.8.0rc18.dist-info → mlrun-1.8.0rc20.dist-info}/RECORD +31 -31
- {mlrun-1.8.0rc18.dist-info → mlrun-1.8.0rc20.dist-info}/LICENSE +0 -0
- {mlrun-1.8.0rc18.dist-info → mlrun-1.8.0rc20.dist-info}/WHEEL +0 -0
- {mlrun-1.8.0rc18.dist-info → mlrun-1.8.0rc20.dist-info}/entry_points.txt +0 -0
- {mlrun-1.8.0rc18.dist-info → mlrun-1.8.0rc20.dist-info}/top_level.txt +0 -0
|
@@ -13,7 +13,7 @@
|
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
|
|
15
15
|
import typing
|
|
16
|
-
from datetime import datetime, timedelta
|
|
16
|
+
from datetime import datetime, timedelta
|
|
17
17
|
|
|
18
18
|
import pandas as pd
|
|
19
19
|
import taosws
|
|
@@ -164,6 +164,17 @@ class TDEngineConnector(TSDBConnector):
|
|
|
164
164
|
def _convert_to_datetime(val: typing.Union[str, datetime]) -> datetime:
|
|
165
165
|
return datetime.fromisoformat(val) if isinstance(val, str) else val
|
|
166
166
|
|
|
167
|
+
@staticmethod
|
|
168
|
+
def _get_endpoint_filter(endpoint_id: typing.Union[str, list[str]]) -> str:
|
|
169
|
+
if isinstance(endpoint_id, str):
|
|
170
|
+
return f"endpoint_id='{endpoint_id}'"
|
|
171
|
+
elif isinstance(endpoint_id, list):
|
|
172
|
+
return f"endpoint_id IN({str(endpoint_id)[1:-1]}) "
|
|
173
|
+
else:
|
|
174
|
+
raise mlrun.errors.MLRunInvalidArgumentError(
|
|
175
|
+
"Invalid 'endpoint_id' filter: must be a string or a list."
|
|
176
|
+
)
|
|
177
|
+
|
|
167
178
|
def apply_monitoring_stream_steps(self, graph, **kwarg):
|
|
168
179
|
"""
|
|
169
180
|
Apply TSDB steps on the provided monitoring graph. Throughout these steps, the graph stores live data of
|
|
@@ -195,6 +206,8 @@ class TDEngineConnector(TSDBConnector):
|
|
|
195
206
|
columns=[
|
|
196
207
|
mm_schemas.EventFieldType.LATENCY,
|
|
197
208
|
mm_schemas.EventKeyMetrics.CUSTOM_METRICS,
|
|
209
|
+
mm_schemas.EventFieldType.ESTIMATED_PREDICTION_COUNT,
|
|
210
|
+
mm_schemas.EventFieldType.EFFECTIVE_SAMPLE_COUNT,
|
|
198
211
|
],
|
|
199
212
|
tag_cols=[
|
|
200
213
|
mm_schemas.EventFieldType.ENDPOINT_ID,
|
|
@@ -472,7 +485,7 @@ class TDEngineConnector(TSDBConnector):
|
|
|
472
485
|
table=self.tables[mm_schemas.TDEngineSuperTables.PREDICTIONS].super_table,
|
|
473
486
|
start=start,
|
|
474
487
|
end=end,
|
|
475
|
-
columns=[mm_schemas.EventFieldType.
|
|
488
|
+
columns=[mm_schemas.EventFieldType.ESTIMATED_PREDICTION_COUNT],
|
|
476
489
|
filter_query=f"endpoint_id='{endpoint_id}'",
|
|
477
490
|
agg_funcs=agg_funcs,
|
|
478
491
|
interval=aggregation_window,
|
|
@@ -492,10 +505,10 @@ class TDEngineConnector(TSDBConnector):
|
|
|
492
505
|
df["_wend"] = pd.to_datetime(df["_wend"])
|
|
493
506
|
df.set_index("_wend", inplace=True)
|
|
494
507
|
|
|
495
|
-
|
|
496
|
-
f"{agg_funcs[0]}({mm_schemas.EventFieldType.
|
|
508
|
+
estimated_prediction_count = (
|
|
509
|
+
f"{agg_funcs[0]}({mm_schemas.EventFieldType.ESTIMATED_PREDICTION_COUNT})"
|
|
497
510
|
if agg_funcs
|
|
498
|
-
else mm_schemas.EventFieldType.
|
|
511
|
+
else mm_schemas.EventFieldType.ESTIMATED_PREDICTION_COUNT
|
|
499
512
|
)
|
|
500
513
|
|
|
501
514
|
return mm_schemas.ModelEndpointMonitoringMetricValues(
|
|
@@ -503,7 +516,7 @@ class TDEngineConnector(TSDBConnector):
|
|
|
503
516
|
values=list(
|
|
504
517
|
zip(
|
|
505
518
|
df.index,
|
|
506
|
-
df[
|
|
519
|
+
df[estimated_prediction_count],
|
|
507
520
|
)
|
|
508
521
|
), # pyright: ignore[reportArgumentType]
|
|
509
522
|
)
|
|
@@ -514,9 +527,7 @@ class TDEngineConnector(TSDBConnector):
|
|
|
514
527
|
start: typing.Optional[datetime] = None,
|
|
515
528
|
end: typing.Optional[datetime] = None,
|
|
516
529
|
) -> pd.DataFrame:
|
|
517
|
-
|
|
518
|
-
endpoint_ids if isinstance(endpoint_ids, list) else [endpoint_ids]
|
|
519
|
-
)
|
|
530
|
+
filter_query = self._get_endpoint_filter(endpoint_id=endpoint_ids)
|
|
520
531
|
start, end = self._get_start_end(start, end)
|
|
521
532
|
df = self._get_records(
|
|
522
533
|
table=self.tables[mm_schemas.TDEngineSuperTables.PREDICTIONS].super_table,
|
|
@@ -527,7 +538,7 @@ class TDEngineConnector(TSDBConnector):
|
|
|
527
538
|
mm_schemas.EventFieldType.TIME,
|
|
528
539
|
mm_schemas.EventFieldType.LATENCY,
|
|
529
540
|
],
|
|
530
|
-
filter_query=
|
|
541
|
+
filter_query=filter_query,
|
|
531
542
|
timestamp_column=mm_schemas.EventFieldType.TIME,
|
|
532
543
|
agg_funcs=["last"],
|
|
533
544
|
group_by=mm_schemas.EventFieldType.ENDPOINT_ID,
|
|
@@ -542,12 +553,11 @@ class TDEngineConnector(TSDBConnector):
|
|
|
542
553
|
},
|
|
543
554
|
inplace=True,
|
|
544
555
|
)
|
|
545
|
-
df[mm_schemas.EventFieldType.LAST_REQUEST] =
|
|
546
|
-
mm_schemas.EventFieldType.LAST_REQUEST
|
|
547
|
-
|
|
548
|
-
|
|
549
|
-
|
|
550
|
-
).astimezone(tz=timezone.utc)
|
|
556
|
+
df[mm_schemas.EventFieldType.LAST_REQUEST] = pd.to_datetime(
|
|
557
|
+
df[mm_schemas.EventFieldType.LAST_REQUEST],
|
|
558
|
+
errors="coerce",
|
|
559
|
+
format="ISO8601",
|
|
560
|
+
utc=True,
|
|
551
561
|
)
|
|
552
562
|
return df
|
|
553
563
|
|
|
@@ -557,9 +567,7 @@ class TDEngineConnector(TSDBConnector):
|
|
|
557
567
|
start: typing.Optional[datetime] = None,
|
|
558
568
|
end: typing.Optional[datetime] = None,
|
|
559
569
|
) -> pd.DataFrame:
|
|
560
|
-
|
|
561
|
-
endpoint_ids if isinstance(endpoint_ids, list) else [endpoint_ids]
|
|
562
|
-
)
|
|
570
|
+
filter_query = self._get_endpoint_filter(endpoint_id=endpoint_ids)
|
|
563
571
|
start = start or (mlrun.utils.datetime_now() - timedelta(hours=24))
|
|
564
572
|
start, end = self._get_start_end(start, end)
|
|
565
573
|
df = self._get_records(
|
|
@@ -570,7 +578,7 @@ class TDEngineConnector(TSDBConnector):
|
|
|
570
578
|
mm_schemas.ResultData.RESULT_STATUS,
|
|
571
579
|
mm_schemas.EventFieldType.ENDPOINT_ID,
|
|
572
580
|
],
|
|
573
|
-
filter_query=
|
|
581
|
+
filter_query=filter_query,
|
|
574
582
|
timestamp_column=mm_schemas.WriterEvent.END_INFER_TIME,
|
|
575
583
|
agg_funcs=["max"],
|
|
576
584
|
group_by=mm_schemas.EventFieldType.ENDPOINT_ID,
|
|
@@ -588,7 +596,7 @@ class TDEngineConnector(TSDBConnector):
|
|
|
588
596
|
|
|
589
597
|
def get_metrics_metadata(
|
|
590
598
|
self,
|
|
591
|
-
endpoint_id: str,
|
|
599
|
+
endpoint_id: typing.Union[str, list[str]],
|
|
592
600
|
start: typing.Optional[datetime] = None,
|
|
593
601
|
end: typing.Optional[datetime] = None,
|
|
594
602
|
) -> pd.DataFrame:
|
|
@@ -602,11 +610,12 @@ class TDEngineConnector(TSDBConnector):
|
|
|
602
610
|
mm_schemas.MetricData.METRIC_NAME,
|
|
603
611
|
mm_schemas.EventFieldType.ENDPOINT_ID,
|
|
604
612
|
],
|
|
605
|
-
filter_query=
|
|
613
|
+
filter_query=self._get_endpoint_filter(endpoint_id=endpoint_id),
|
|
606
614
|
timestamp_column=mm_schemas.WriterEvent.END_INFER_TIME,
|
|
607
615
|
group_by=[
|
|
608
616
|
mm_schemas.WriterEvent.APPLICATION_NAME,
|
|
609
617
|
mm_schemas.MetricData.METRIC_NAME,
|
|
618
|
+
mm_schemas.EventFieldType.ENDPOINT_ID,
|
|
610
619
|
],
|
|
611
620
|
agg_funcs=["last"],
|
|
612
621
|
)
|
|
@@ -624,7 +633,7 @@ class TDEngineConnector(TSDBConnector):
|
|
|
624
633
|
|
|
625
634
|
def get_results_metadata(
|
|
626
635
|
self,
|
|
627
|
-
endpoint_id: str,
|
|
636
|
+
endpoint_id: typing.Union[str, list[str]],
|
|
628
637
|
start: typing.Optional[datetime] = None,
|
|
629
638
|
end: typing.Optional[datetime] = None,
|
|
630
639
|
) -> pd.DataFrame:
|
|
@@ -639,11 +648,12 @@ class TDEngineConnector(TSDBConnector):
|
|
|
639
648
|
mm_schemas.ResultData.RESULT_KIND,
|
|
640
649
|
mm_schemas.EventFieldType.ENDPOINT_ID,
|
|
641
650
|
],
|
|
642
|
-
filter_query=
|
|
651
|
+
filter_query=self._get_endpoint_filter(endpoint_id=endpoint_id),
|
|
643
652
|
timestamp_column=mm_schemas.WriterEvent.END_INFER_TIME,
|
|
644
653
|
group_by=[
|
|
645
654
|
mm_schemas.WriterEvent.APPLICATION_NAME,
|
|
646
655
|
mm_schemas.ResultData.RESULT_NAME,
|
|
656
|
+
mm_schemas.EventFieldType.ENDPOINT_ID,
|
|
647
657
|
],
|
|
648
658
|
agg_funcs=["last"],
|
|
649
659
|
)
|
|
@@ -666,9 +676,8 @@ class TDEngineConnector(TSDBConnector):
|
|
|
666
676
|
start: typing.Optional[datetime] = None,
|
|
667
677
|
end: typing.Optional[datetime] = None,
|
|
668
678
|
) -> pd.DataFrame:
|
|
669
|
-
|
|
670
|
-
|
|
671
|
-
)
|
|
679
|
+
filter_query = self._get_endpoint_filter(endpoint_id=endpoint_ids)
|
|
680
|
+
filter_query += f"AND {mm_schemas.EventFieldType.ERROR_TYPE} = '{mm_schemas.EventFieldType.INFER_ERROR}'"
|
|
672
681
|
start, end = self._get_start_end(start, end)
|
|
673
682
|
df = self._get_records(
|
|
674
683
|
table=self.tables[mm_schemas.TDEngineSuperTables.ERRORS].super_table,
|
|
@@ -679,8 +688,7 @@ class TDEngineConnector(TSDBConnector):
|
|
|
679
688
|
mm_schemas.EventFieldType.ENDPOINT_ID,
|
|
680
689
|
],
|
|
681
690
|
agg_funcs=["count"],
|
|
682
|
-
filter_query=
|
|
683
|
-
f"AND {mm_schemas.EventFieldType.ERROR_TYPE} = '{mm_schemas.EventFieldType.INFER_ERROR}'",
|
|
691
|
+
filter_query=filter_query,
|
|
684
692
|
group_by=mm_schemas.EventFieldType.ENDPOINT_ID,
|
|
685
693
|
preform_agg_columns=[mm_schemas.EventFieldType.MODEL_ERROR],
|
|
686
694
|
)
|
|
@@ -33,6 +33,8 @@ _TSDB_BE = "tsdb"
|
|
|
33
33
|
_TSDB_RATE = "1/s"
|
|
34
34
|
_CONTAINER = "users"
|
|
35
35
|
|
|
36
|
+
V3IO_MEPS_LIMIT = 200
|
|
37
|
+
|
|
36
38
|
|
|
37
39
|
def _is_no_schema_error(exc: v3io_frames.Error) -> bool:
|
|
38
40
|
"""
|
|
@@ -232,6 +234,8 @@ class V3IOTSDBConnector(TSDBConnector):
|
|
|
232
234
|
columns=[
|
|
233
235
|
mm_schemas.EventFieldType.LATENCY,
|
|
234
236
|
mm_schemas.EventFieldType.LAST_REQUEST_TIMESTAMP,
|
|
237
|
+
mm_schemas.EventFieldType.ESTIMATED_PREDICTION_COUNT,
|
|
238
|
+
mm_schemas.EventFieldType.EFFECTIVE_SAMPLE_COUNT,
|
|
235
239
|
],
|
|
236
240
|
index_cols=[
|
|
237
241
|
mm_schemas.EventFieldType.ENDPOINT_ID,
|
|
@@ -577,6 +581,25 @@ class V3IOTSDBConnector(TSDBConnector):
|
|
|
577
581
|
token=v3io_access_key,
|
|
578
582
|
)
|
|
579
583
|
|
|
584
|
+
@staticmethod
|
|
585
|
+
def _get_endpoint_filter(endpoint_id: Union[str, list[str]]) -> Optional[str]:
|
|
586
|
+
if isinstance(endpoint_id, str):
|
|
587
|
+
return f"endpoint_id=='{endpoint_id}'"
|
|
588
|
+
elif isinstance(endpoint_id, list):
|
|
589
|
+
if len(endpoint_id) > V3IO_MEPS_LIMIT:
|
|
590
|
+
logger.info(
|
|
591
|
+
"The number of endpoint ids exceeds the v3io-engine filter-expression limit, "
|
|
592
|
+
"retrieving all the model endpoints from the db.",
|
|
593
|
+
limit=V3IO_MEPS_LIMIT,
|
|
594
|
+
amount=len(endpoint_id),
|
|
595
|
+
)
|
|
596
|
+
return None
|
|
597
|
+
return f"endpoint_id IN({str(endpoint_id)[1:-1]}) "
|
|
598
|
+
else:
|
|
599
|
+
raise mlrun.errors.MLRunInvalidArgumentError(
|
|
600
|
+
f"Invalid 'endpoint_id' filter: must be a string or a list, endpoint_id: {endpoint_id}"
|
|
601
|
+
)
|
|
602
|
+
|
|
580
603
|
def read_metrics_data(
|
|
581
604
|
self,
|
|
582
605
|
*,
|
|
@@ -720,7 +743,7 @@ class V3IOTSDBConnector(TSDBConnector):
|
|
|
720
743
|
table=mm_schemas.FileTargetKind.PREDICTIONS,
|
|
721
744
|
start=start,
|
|
722
745
|
end=end,
|
|
723
|
-
columns=[mm_schemas.EventFieldType.
|
|
746
|
+
columns=[mm_schemas.EventFieldType.ESTIMATED_PREDICTION_COUNT],
|
|
724
747
|
filter_query=f"endpoint_id=='{endpoint_id}'",
|
|
725
748
|
agg_funcs=agg_funcs,
|
|
726
749
|
sliding_window_step=aggregation_window,
|
|
@@ -734,10 +757,10 @@ class V3IOTSDBConnector(TSDBConnector):
|
|
|
734
757
|
type=mm_schemas.ModelEndpointMonitoringMetricType.METRIC,
|
|
735
758
|
)
|
|
736
759
|
|
|
737
|
-
|
|
738
|
-
f"{agg_funcs[0]}({mm_schemas.EventFieldType.
|
|
760
|
+
estimated_prediction_count = (
|
|
761
|
+
f"{agg_funcs[0]}({mm_schemas.EventFieldType.ESTIMATED_PREDICTION_COUNT})"
|
|
739
762
|
if agg_funcs
|
|
740
|
-
else mm_schemas.EventFieldType.
|
|
763
|
+
else mm_schemas.EventFieldType.ESTIMATED_PREDICTION_COUNT
|
|
741
764
|
)
|
|
742
765
|
|
|
743
766
|
return mm_schemas.ModelEndpointMonitoringMetricValues(
|
|
@@ -745,7 +768,7 @@ class V3IOTSDBConnector(TSDBConnector):
|
|
|
745
768
|
values=list(
|
|
746
769
|
zip(
|
|
747
770
|
df.index,
|
|
748
|
-
df[
|
|
771
|
+
df[estimated_prediction_count],
|
|
749
772
|
)
|
|
750
773
|
), # pyright: ignore[reportArgumentType]
|
|
751
774
|
)
|
|
@@ -756,15 +779,13 @@ class V3IOTSDBConnector(TSDBConnector):
|
|
|
756
779
|
start: Optional[datetime] = None,
|
|
757
780
|
end: Optional[datetime] = None,
|
|
758
781
|
) -> pd.DataFrame:
|
|
759
|
-
|
|
760
|
-
endpoint_ids if isinstance(endpoint_ids, list) else [endpoint_ids]
|
|
761
|
-
)
|
|
782
|
+
filter_query = self._get_endpoint_filter(endpoint_id=endpoint_ids)
|
|
762
783
|
start, end = self._get_start_end(start, end)
|
|
763
784
|
df = self._get_records(
|
|
764
785
|
table=mm_schemas.FileTargetKind.PREDICTIONS,
|
|
765
786
|
start=start,
|
|
766
787
|
end=end,
|
|
767
|
-
filter_query=
|
|
788
|
+
filter_query=filter_query,
|
|
768
789
|
agg_funcs=["last"],
|
|
769
790
|
)
|
|
770
791
|
if not df.empty:
|
|
@@ -791,9 +812,7 @@ class V3IOTSDBConnector(TSDBConnector):
|
|
|
791
812
|
start: Optional[datetime] = None,
|
|
792
813
|
end: Optional[datetime] = None,
|
|
793
814
|
) -> pd.DataFrame:
|
|
794
|
-
|
|
795
|
-
endpoint_ids if isinstance(endpoint_ids, list) else [endpoint_ids]
|
|
796
|
-
)
|
|
815
|
+
filter_query = self._get_endpoint_filter(endpoint_id=endpoint_ids)
|
|
797
816
|
start = start or (mlrun.utils.datetime_now() - timedelta(hours=24))
|
|
798
817
|
start, end = self._get_start_end(start, end)
|
|
799
818
|
df = self._get_records(
|
|
@@ -801,7 +820,7 @@ class V3IOTSDBConnector(TSDBConnector):
|
|
|
801
820
|
start=start,
|
|
802
821
|
end=end,
|
|
803
822
|
columns=[mm_schemas.ResultData.RESULT_STATUS],
|
|
804
|
-
filter_query=
|
|
823
|
+
filter_query=filter_query,
|
|
805
824
|
agg_funcs=["max"],
|
|
806
825
|
group_by="endpoint_id",
|
|
807
826
|
)
|
|
@@ -813,17 +832,18 @@ class V3IOTSDBConnector(TSDBConnector):
|
|
|
813
832
|
|
|
814
833
|
def get_metrics_metadata(
|
|
815
834
|
self,
|
|
816
|
-
endpoint_id: str,
|
|
835
|
+
endpoint_id: Union[str, list[str]],
|
|
817
836
|
start: Optional[datetime] = None,
|
|
818
837
|
end: Optional[datetime] = None,
|
|
819
838
|
) -> pd.DataFrame:
|
|
820
839
|
start, end = self._get_start_end(start, end)
|
|
840
|
+
filter_query = self._get_endpoint_filter(endpoint_id=endpoint_id)
|
|
821
841
|
df = self._get_records(
|
|
822
842
|
table=mm_schemas.V3IOTSDBTables.METRICS,
|
|
823
843
|
start=start,
|
|
824
844
|
end=end,
|
|
825
845
|
columns=[mm_schemas.MetricData.METRIC_VALUE],
|
|
826
|
-
filter_query=
|
|
846
|
+
filter_query=filter_query,
|
|
827
847
|
agg_funcs=["last"],
|
|
828
848
|
)
|
|
829
849
|
if not df.empty:
|
|
@@ -834,11 +854,12 @@ class V3IOTSDBConnector(TSDBConnector):
|
|
|
834
854
|
|
|
835
855
|
def get_results_metadata(
|
|
836
856
|
self,
|
|
837
|
-
endpoint_id: str,
|
|
857
|
+
endpoint_id: Union[str, list[str]],
|
|
838
858
|
start: Optional[datetime] = None,
|
|
839
859
|
end: Optional[datetime] = None,
|
|
840
860
|
) -> pd.DataFrame:
|
|
841
861
|
start, end = self._get_start_end(start, end)
|
|
862
|
+
filter_query = self._get_endpoint_filter(endpoint_id=endpoint_id)
|
|
842
863
|
df = self._get_records(
|
|
843
864
|
table=mm_schemas.V3IOTSDBTables.APP_RESULTS,
|
|
844
865
|
start=start,
|
|
@@ -846,7 +867,7 @@ class V3IOTSDBConnector(TSDBConnector):
|
|
|
846
867
|
columns=[
|
|
847
868
|
mm_schemas.ResultData.RESULT_KIND,
|
|
848
869
|
],
|
|
849
|
-
filter_query=
|
|
870
|
+
filter_query=filter_query,
|
|
850
871
|
agg_funcs=["last"],
|
|
851
872
|
)
|
|
852
873
|
if not df.empty:
|
|
@@ -864,17 +885,18 @@ class V3IOTSDBConnector(TSDBConnector):
|
|
|
864
885
|
start: Optional[datetime] = None,
|
|
865
886
|
end: Optional[datetime] = None,
|
|
866
887
|
) -> pd.DataFrame:
|
|
867
|
-
|
|
868
|
-
|
|
869
|
-
|
|
888
|
+
filter_query = self._get_endpoint_filter(endpoint_id=endpoint_ids)
|
|
889
|
+
if filter_query:
|
|
890
|
+
filter_query += f"AND {mm_schemas.EventFieldType.ERROR_TYPE} == '{mm_schemas.EventFieldType.INFER_ERROR}'"
|
|
891
|
+
else:
|
|
892
|
+
filter_query = f"{mm_schemas.EventFieldType.ERROR_TYPE} == '{mm_schemas.EventFieldType.INFER_ERROR}' z"
|
|
870
893
|
start, end = self._get_start_end(start, end)
|
|
871
894
|
df = self._get_records(
|
|
872
895
|
table=mm_schemas.FileTargetKind.ERRORS,
|
|
873
896
|
start=start,
|
|
874
897
|
end=end,
|
|
875
898
|
columns=[mm_schemas.EventFieldType.ERROR_COUNT],
|
|
876
|
-
filter_query=
|
|
877
|
-
f"AND {mm_schemas.EventFieldType.ERROR_TYPE} == '{mm_schemas.EventFieldType.INFER_ERROR}'",
|
|
899
|
+
filter_query=filter_query,
|
|
878
900
|
agg_funcs=["count"],
|
|
879
901
|
)
|
|
880
902
|
if not df.empty:
|
|
@@ -893,9 +915,7 @@ class V3IOTSDBConnector(TSDBConnector):
|
|
|
893
915
|
start: Optional[datetime] = None,
|
|
894
916
|
end: Optional[datetime] = None,
|
|
895
917
|
) -> pd.DataFrame:
|
|
896
|
-
|
|
897
|
-
endpoint_ids if isinstance(endpoint_ids, list) else [endpoint_ids]
|
|
898
|
-
)
|
|
918
|
+
filter_query = self._get_endpoint_filter(endpoint_id=endpoint_ids)
|
|
899
919
|
start = start or (mlrun.utils.datetime_now() - timedelta(hours=24))
|
|
900
920
|
start, end = self._get_start_end(start, end)
|
|
901
921
|
df = self._get_records(
|
|
@@ -903,7 +923,7 @@ class V3IOTSDBConnector(TSDBConnector):
|
|
|
903
923
|
start=start,
|
|
904
924
|
end=end,
|
|
905
925
|
columns=[mm_schemas.EventFieldType.LATENCY],
|
|
906
|
-
filter_query=
|
|
926
|
+
filter_query=filter_query,
|
|
907
927
|
agg_funcs=["avg"],
|
|
908
928
|
)
|
|
909
929
|
if not df.empty:
|
|
@@ -32,7 +32,7 @@ import mlrun.utils.helpers
|
|
|
32
32
|
from mlrun.common.schemas import ModelEndpoint
|
|
33
33
|
from mlrun.common.schemas.model_monitoring.model_endpoints import (
|
|
34
34
|
ModelEndpointMonitoringMetric,
|
|
35
|
-
|
|
35
|
+
compose_full_name,
|
|
36
36
|
)
|
|
37
37
|
from mlrun.utils import logger
|
|
38
38
|
|
|
@@ -450,7 +450,7 @@ def get_default_result_instance_fqn(model_endpoint_id: str) -> str:
|
|
|
450
450
|
|
|
451
451
|
|
|
452
452
|
def get_invocations_fqn(project: str) -> str:
|
|
453
|
-
return
|
|
453
|
+
return compose_full_name(
|
|
454
454
|
project=project,
|
|
455
455
|
app=mm_constants.SpecialApps.MLRUN_INFRA,
|
|
456
456
|
name=mm_constants.PredictionsQueryConstants.INVOCATIONS,
|
|
@@ -430,6 +430,10 @@ class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
|
|
|
430
430
|
if not isinstance(feature, list):
|
|
431
431
|
feature = [feature]
|
|
432
432
|
|
|
433
|
+
effective_sample_count, estimated_prediction_count = (
|
|
434
|
+
self._get_effective_and_estimated_counts(event=event)
|
|
435
|
+
)
|
|
436
|
+
|
|
433
437
|
events.append(
|
|
434
438
|
{
|
|
435
439
|
EventFieldType.FUNCTION_URI: function_uri,
|
|
@@ -453,6 +457,8 @@ class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
|
|
|
453
457
|
EventFieldType.ENTITIES: event.get("request", {}).get(
|
|
454
458
|
EventFieldType.ENTITIES, {}
|
|
455
459
|
),
|
|
460
|
+
EventFieldType.EFFECTIVE_SAMPLE_COUNT: effective_sample_count,
|
|
461
|
+
EventFieldType.ESTIMATED_PREDICTION_COUNT: estimated_prediction_count,
|
|
456
462
|
}
|
|
457
463
|
)
|
|
458
464
|
|
|
@@ -507,6 +513,20 @@ class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
|
|
|
507
513
|
self.error_count[endpoint_id] += 1
|
|
508
514
|
return False
|
|
509
515
|
|
|
516
|
+
@staticmethod
|
|
517
|
+
def _get_effective_and_estimated_counts(event):
|
|
518
|
+
"""
|
|
519
|
+
Calculate the `effective_sample_count` and the `estimated_prediction_count` based on the event's
|
|
520
|
+
sampling percentage. These values will be stored in the TSDB target.
|
|
521
|
+
Note that in non-batch serving, the `effective_sample_count` is always set to 1. In addition, when the sampling
|
|
522
|
+
percentage is 100%, the `estimated_prediction_count` is equal to the `effective_sample_count`.
|
|
523
|
+
"""
|
|
524
|
+
effective_sample_count = event.get(EventFieldType.EFFECTIVE_SAMPLE_COUNT, 1)
|
|
525
|
+
estimated_prediction_count = effective_sample_count * (
|
|
526
|
+
100 / event.get(EventFieldType.SAMPLING_PERCENTAGE, 100)
|
|
527
|
+
)
|
|
528
|
+
return effective_sample_count, estimated_prediction_count
|
|
529
|
+
|
|
510
530
|
|
|
511
531
|
def is_not_none(field: typing.Any, dict_path: list[str]):
|
|
512
532
|
if field is not None:
|
|
@@ -672,6 +692,7 @@ class MapFeatureNames(mlrun.feature_store.steps.MapClass):
|
|
|
672
692
|
)
|
|
673
693
|
)
|
|
674
694
|
self.first_request[endpoint_id] = True
|
|
695
|
+
|
|
675
696
|
if attributes_to_update:
|
|
676
697
|
logger.info(
|
|
677
698
|
"Updating endpoint record",
|
mlrun/projects/pipelines.py
CHANGED
|
@@ -523,11 +523,12 @@ class _PipelineRunner(abc.ABC):
|
|
|
523
523
|
text = _PipelineRunner._generate_workflow_finished_message(
|
|
524
524
|
run.run_id, errors_counter, run._state
|
|
525
525
|
)
|
|
526
|
-
|
|
527
526
|
notifiers = notifiers or project.notifiers
|
|
528
527
|
if notifiers:
|
|
529
528
|
notifiers.push(text, "info", runs)
|
|
530
529
|
|
|
530
|
+
project.push_pipeline_notification_kfp_runner(run.run_id, run._state, text)
|
|
531
|
+
|
|
531
532
|
if raise_error:
|
|
532
533
|
raise raise_error
|
|
533
534
|
return state or run._state, errors_counter, text
|
|
@@ -620,6 +621,8 @@ class _KFPRunner(_PipelineRunner):
|
|
|
620
621
|
params.update(notification.secret_params)
|
|
621
622
|
project.notifiers.add_notification(notification.kind, params)
|
|
622
623
|
|
|
624
|
+
project.spec.notifications = notifications
|
|
625
|
+
|
|
623
626
|
run_id = _run_pipeline(
|
|
624
627
|
workflow_handler,
|
|
625
628
|
project=project.metadata.name,
|
|
@@ -647,13 +650,23 @@ class _KFPRunner(_PipelineRunner):
|
|
|
647
650
|
exc_info=err_to_str(exc),
|
|
648
651
|
)
|
|
649
652
|
|
|
650
|
-
#
|
|
651
|
-
|
|
653
|
+
# Pushing only relevant notification for the client (ipython and console)
|
|
654
|
+
project.notifiers.push_pipeline_start_message_from_client(
|
|
655
|
+
project.metadata.name, pipeline_id=run_id
|
|
656
|
+
)
|
|
657
|
+
|
|
652
658
|
if context:
|
|
653
659
|
project.notifiers.push_pipeline_start_message(
|
|
654
660
|
project.metadata.name,
|
|
655
661
|
context.uid,
|
|
656
662
|
)
|
|
663
|
+
else:
|
|
664
|
+
project.push_pipeline_notification_kfp_runner(
|
|
665
|
+
run_id,
|
|
666
|
+
mlrun_pipelines.common.models.RunStatuses.running,
|
|
667
|
+
f"Workflow {run_id} started in project {project.metadata.name}",
|
|
668
|
+
notifications,
|
|
669
|
+
)
|
|
657
670
|
pipeline_context.clear()
|
|
658
671
|
return _PipelineRunStatus(run_id, cls, project=project, workflow=workflow_spec)
|
|
659
672
|
|
mlrun/projects/project.py
CHANGED
|
@@ -83,6 +83,7 @@ from ..artifacts import (
|
|
|
83
83
|
ModelArtifact,
|
|
84
84
|
)
|
|
85
85
|
from ..artifacts.manager import ArtifactManager, dict_to_artifact, extend_artifact_path
|
|
86
|
+
from ..common.runtimes.constants import RunStates
|
|
86
87
|
from ..datastore import store_manager
|
|
87
88
|
from ..features import Feature
|
|
88
89
|
from ..model import EntrypointParam, ImageBuilder, ModelObj
|
|
@@ -851,6 +852,7 @@ class ProjectSpec(ModelObj):
|
|
|
851
852
|
build=None,
|
|
852
853
|
custom_packagers: Optional[list[tuple[str, bool]]] = None,
|
|
853
854
|
default_function_node_selector=None,
|
|
855
|
+
notifications=None,
|
|
854
856
|
):
|
|
855
857
|
self.repo = None
|
|
856
858
|
|
|
@@ -891,6 +893,7 @@ class ProjectSpec(ModelObj):
|
|
|
891
893
|
# whether it is mandatory for a run (raise exception on collection error) or not.
|
|
892
894
|
self.custom_packagers = custom_packagers or []
|
|
893
895
|
self._default_function_node_selector = default_function_node_selector or None
|
|
896
|
+
self.notifications = notifications or []
|
|
894
897
|
|
|
895
898
|
@property
|
|
896
899
|
def source(self) -> str:
|
|
@@ -1172,7 +1175,6 @@ class MlrunProject(ModelObj):
|
|
|
1172
1175
|
self._artifact_manager = None
|
|
1173
1176
|
self._notifiers = CustomNotificationPusher(
|
|
1174
1177
|
[
|
|
1175
|
-
NotificationTypes.slack,
|
|
1176
1178
|
NotificationTypes.console,
|
|
1177
1179
|
NotificationTypes.ipython,
|
|
1178
1180
|
]
|
|
@@ -2137,18 +2139,23 @@ class MlrunProject(ModelObj):
|
|
|
2137
2139
|
db = mlrun.db.get_run_db(secrets=self._secrets)
|
|
2138
2140
|
matching_results = []
|
|
2139
2141
|
alerts = []
|
|
2140
|
-
|
|
2141
|
-
|
|
2142
|
-
|
|
2143
|
-
|
|
2144
|
-
|
|
2142
|
+
endpoint_ids = [endpoint.metadata.uid for endpoint in endpoints.endpoints]
|
|
2143
|
+
# using separation to group by endpoint IDs:
|
|
2144
|
+
# {"mep_id1": [...], "mep_id2": [...]}
|
|
2145
|
+
results_by_endpoint = db.get_metrics_by_multiple_endpoints(
|
|
2146
|
+
project=self.name,
|
|
2147
|
+
endpoint_ids=endpoint_ids,
|
|
2148
|
+
type="results",
|
|
2149
|
+
events_format=mm_constants.GetEventsFormat.SEPARATION,
|
|
2150
|
+
)
|
|
2151
|
+
for endpoint_uid, results in results_by_endpoint.items():
|
|
2145
2152
|
results_fqn_by_endpoint = [
|
|
2146
2153
|
get_result_instance_fqn(
|
|
2147
|
-
model_endpoint_id=
|
|
2154
|
+
model_endpoint_id=endpoint_uid,
|
|
2148
2155
|
app_name=result.app,
|
|
2149
2156
|
result_name=result.name,
|
|
2150
2157
|
)
|
|
2151
|
-
for result in
|
|
2158
|
+
for result in results
|
|
2152
2159
|
]
|
|
2153
2160
|
matching_results += filter_results_by_regex(
|
|
2154
2161
|
existing_result_names=results_fqn_by_endpoint,
|
|
@@ -2665,6 +2672,36 @@ class MlrunProject(ModelObj):
|
|
|
2665
2672
|
timeout=timeout,
|
|
2666
2673
|
)
|
|
2667
2674
|
|
|
2675
|
+
def push_pipeline_notification_kfp_runner(
|
|
2676
|
+
self,
|
|
2677
|
+
pipeline_id: str,
|
|
2678
|
+
current_run_state: mlrun_pipelines.common.models.RunStatuses,
|
|
2679
|
+
message: str,
|
|
2680
|
+
notifications: Optional[list] = None,
|
|
2681
|
+
):
|
|
2682
|
+
"""
|
|
2683
|
+
Push notifications for a pipeline run (KFP).
|
|
2684
|
+
|
|
2685
|
+
:param pipeline_id: Unique ID of the pipeline run.
|
|
2686
|
+
:param current_run_state: Current run state of the pipeline.
|
|
2687
|
+
:param message: Message to send in the notification.
|
|
2688
|
+
:param notifications: List of notifications to send.
|
|
2689
|
+
"""
|
|
2690
|
+
current_run_state = RunStates.pipeline_run_status_to_run_state(
|
|
2691
|
+
current_run_state
|
|
2692
|
+
)
|
|
2693
|
+
db = mlrun.get_run_db()
|
|
2694
|
+
notifications = notifications or self.spec.notifications
|
|
2695
|
+
notifications_to_send = []
|
|
2696
|
+
for notification in notifications:
|
|
2697
|
+
if current_run_state in notification.when:
|
|
2698
|
+
notification_copy = notification.copy()
|
|
2699
|
+
notification_copy.message = message
|
|
2700
|
+
notifications_to_send.append(notification_copy)
|
|
2701
|
+
db.push_pipeline_notifications(
|
|
2702
|
+
pipeline_id, self.metadata.name, notifications_to_send
|
|
2703
|
+
)
|
|
2704
|
+
|
|
2668
2705
|
def _instantiate_function(
|
|
2669
2706
|
self,
|
|
2670
2707
|
func: typing.Union[str, mlrun.runtimes.BaseRuntime] = None,
|
mlrun/runtimes/nuclio/serving.py
CHANGED
|
@@ -309,7 +309,7 @@ class ServingRuntime(RemoteRuntime):
|
|
|
309
309
|
self,
|
|
310
310
|
stream_path: Optional[str] = None,
|
|
311
311
|
batch: Optional[int] = None,
|
|
312
|
-
|
|
312
|
+
sampling_percentage: float = 100,
|
|
313
313
|
stream_args: Optional[dict] = None,
|
|
314
314
|
tracking_policy: Optional[Union["TrackingPolicy", dict]] = None,
|
|
315
315
|
enable_tracking: bool = True,
|
|
@@ -317,13 +317,13 @@ class ServingRuntime(RemoteRuntime):
|
|
|
317
317
|
"""Apply on your serving function to monitor a deployed model, including real-time dashboards to detect drift
|
|
318
318
|
and analyze performance.
|
|
319
319
|
|
|
320
|
-
:param stream_path:
|
|
321
|
-
|
|
322
|
-
:param batch:
|
|
323
|
-
:param
|
|
324
|
-
|
|
325
|
-
:param
|
|
326
|
-
|
|
320
|
+
:param stream_path: Path/url of the tracking stream e.g. v3io:///users/mike/mystream
|
|
321
|
+
you can use the "dummy://" path for test/simulation.
|
|
322
|
+
:param batch: Deprecated. Micro batch size (send micro batches of N records at a time).
|
|
323
|
+
:param sampling_percentage: Downsampling of the events that will be pushed to the monitoring stream, based on
|
|
324
|
+
a specified percentage, e.g. 50 for 50%. By default, all events are pushed.
|
|
325
|
+
:param stream_args: Stream initialization parameters, e.g. shards, retention_in_hours, ..
|
|
326
|
+
:param enable_tracking: Enable/disable model-monitoring tracking. Default True (tracking enabled).
|
|
327
327
|
|
|
328
328
|
Example::
|
|
329
329
|
|
|
@@ -336,12 +336,21 @@ class ServingRuntime(RemoteRuntime):
|
|
|
336
336
|
# Applying model monitoring configurations
|
|
337
337
|
self.spec.track_models = enable_tracking
|
|
338
338
|
|
|
339
|
+
if not 0 < sampling_percentage <= 100:
|
|
340
|
+
raise mlrun.errors.MLRunInvalidArgumentError(
|
|
341
|
+
"`sampling_percentage` must be greater than 0 and less or equal to 100."
|
|
342
|
+
)
|
|
343
|
+
self.spec.parameters["sampling_percentage"] = sampling_percentage
|
|
344
|
+
|
|
339
345
|
if stream_path:
|
|
340
346
|
self.spec.parameters["log_stream"] = stream_path
|
|
341
347
|
if batch:
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
348
|
+
warnings.warn(
|
|
349
|
+
"The `batch` size parameter was deprecated in version 1.8.0 and is no longer used. "
|
|
350
|
+
"It will be removed in 1.10.",
|
|
351
|
+
# TODO: Remove this in 1.10
|
|
352
|
+
FutureWarning,
|
|
353
|
+
)
|
|
345
354
|
if stream_args:
|
|
346
355
|
self.spec.parameters["stream_args"] = stream_args
|
|
347
356
|
if tracking_policy is not None:
|