mlrun 1.7.0rc37__py3-none-any.whl → 1.7.0rc39__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of mlrun might be problematic.
- mlrun/alerts/alert.py +34 -30
- mlrun/common/schemas/alert.py +3 -0
- mlrun/common/schemas/model_monitoring/constants.py +4 -0
- mlrun/common/schemas/notification.py +4 -3
- mlrun/datastore/alibaba_oss.py +2 -2
- mlrun/datastore/azure_blob.py +124 -31
- mlrun/datastore/base.py +1 -1
- mlrun/datastore/dbfs_store.py +2 -2
- mlrun/datastore/google_cloud_storage.py +83 -20
- mlrun/datastore/s3.py +2 -2
- mlrun/datastore/sources.py +54 -0
- mlrun/datastore/targets.py +9 -53
- mlrun/db/httpdb.py +6 -1
- mlrun/errors.py +8 -0
- mlrun/execution.py +7 -0
- mlrun/feature_store/api.py +5 -0
- mlrun/feature_store/common.py +6 -11
- mlrun/feature_store/retrieval/job.py +1 -0
- mlrun/model.py +29 -3
- mlrun/model_monitoring/api.py +9 -0
- mlrun/model_monitoring/applications/_application_steps.py +36 -0
- mlrun/model_monitoring/applications/histogram_data_drift.py +15 -13
- mlrun/model_monitoring/controller.py +15 -11
- mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +14 -11
- mlrun/model_monitoring/db/tsdb/base.py +121 -1
- mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +85 -47
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +100 -12
- mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +23 -1
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +214 -36
- mlrun/model_monitoring/helpers.py +16 -17
- mlrun/model_monitoring/stream_processing.py +68 -27
- mlrun/projects/operations.py +1 -1
- mlrun/projects/pipelines.py +19 -30
- mlrun/projects/project.py +76 -52
- mlrun/run.py +8 -6
- mlrun/runtimes/__init__.py +19 -8
- mlrun/runtimes/nuclio/api_gateway.py +9 -0
- mlrun/runtimes/nuclio/application/application.py +64 -9
- mlrun/runtimes/nuclio/function.py +1 -1
- mlrun/runtimes/pod.py +2 -2
- mlrun/runtimes/remotesparkjob.py +2 -5
- mlrun/runtimes/sparkjob/spark3job.py +7 -9
- mlrun/serving/v2_serving.py +1 -0
- mlrun/track/trackers/mlflow_tracker.py +5 -0
- mlrun/utils/helpers.py +21 -0
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.7.0rc37.dist-info → mlrun-1.7.0rc39.dist-info}/METADATA +14 -11
- {mlrun-1.7.0rc37.dist-info → mlrun-1.7.0rc39.dist-info}/RECORD +52 -52
- {mlrun-1.7.0rc37.dist-info → mlrun-1.7.0rc39.dist-info}/WHEEL +1 -1
- {mlrun-1.7.0rc37.dist-info → mlrun-1.7.0rc39.dist-info}/LICENSE +0 -0
- {mlrun-1.7.0rc37.dist-info → mlrun-1.7.0rc39.dist-info}/entry_points.txt +0 -0
- {mlrun-1.7.0rc37.dist-info → mlrun-1.7.0rc39.dist-info}/top_level.txt +0 -0
mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py
CHANGED

@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from datetime import datetime
+from datetime import datetime, timezone
 from io import StringIO
 from typing import Literal, Optional, Union

@@ -33,7 +33,7 @@ _TSDB_RATE = "1/s"
 _CONTAINER = "users"


-def _is_no_schema_error(exc: v3io_frames.ReadError) -> bool:
+def _is_no_schema_error(exc: v3io_frames.Error) -> bool:
     """
     In case of a nonexistent TSDB table - a `v3io_frames.ReadError` error is raised.
     Check if the error message contains the relevant string to verify the cause.
@@ -89,6 +89,19 @@ class V3IOTSDBConnector(TSDBConnector):
         )
         self.tables[mm_schemas.V3IOTSDBTables.EVENTS] = events_path

+        errors_table_full_path = mlrun.mlconf.get_model_monitoring_file_target_path(
+            project=self.project,
+            kind=mm_schemas.FileTargetKind.ERRORS,
+        )
+        (
+            _,
+            _,
+            errors_path,
+        ) = mlrun.common.model_monitoring.helpers.parse_model_endpoint_store_prefix(
+            errors_table_full_path
+        )
+        self.tables[mm_schemas.V3IOTSDBTables.ERRORS] = errors_path
+
         monitoring_application_full_path = (
             mlrun.mlconf.get_model_monitoring_file_target_path(
                 project=self.project,
@@ -160,7 +173,6 @@ class V3IOTSDBConnector(TSDBConnector):
         - endpoint_features (Prediction and feature names and values)
         - custom_metrics (user-defined metrics)
         """
-
         # Write latency per prediction, labeled by endpoint ID only
         graph.add_step(
             "storey.TSDBTarget",
@@ -171,7 +183,10 @@ class V3IOTSDBConnector(TSDBConnector):
             time_col=mm_schemas.EventFieldType.TIMESTAMP,
             container=self.container,
             v3io_frames=self.v3io_framesd,
-            columns=[mm_schemas.EventFieldType.LATENCY],
+            columns=[
+                mm_schemas.EventFieldType.LATENCY,
+                mm_schemas.EventFieldType.LAST_REQUEST_TIMESTAMP,
+            ],
             index_cols=[
                 mm_schemas.EventFieldType.ENDPOINT_ID,
             ],
@@ -255,6 +270,40 @@ class V3IOTSDBConnector(TSDBConnector):
         apply_storey_filter()
         apply_tsdb_target(name="tsdb3", after="FilterNotNone")

+    def handle_model_error(
+        self,
+        graph,
+        tsdb_batching_max_events: int = 10,
+        tsdb_batching_timeout_secs: int = 60,
+        **kwargs,
+    ) -> None:
+        graph.add_step(
+            "mlrun.model_monitoring.db.tsdb.v3io.stream_graph_steps.ErrorExtractor",
+            name="error_extractor",
+            after="ForwardError",
+        )
+
+        graph.add_step(
+            "storey.TSDBTarget",
+            name="tsdb_error",
+            after="error_extractor",
+            path=f"{self.container}/{self.tables[mm_schemas.FileTargetKind.ERRORS]}",
+            rate="1/s",
+            time_col=mm_schemas.EventFieldType.TIMESTAMP,
+            container=self.container,
+            v3io_frames=self.v3io_framesd,
+            columns=[
+                mm_schemas.EventFieldType.MODEL_ERROR,
+                mm_schemas.EventFieldType.ERROR_COUNT,
+            ],
+            index_cols=[
+                mm_schemas.EventFieldType.ENDPOINT_ID,
+            ],
+            max_events=tsdb_batching_max_events,
+            flush_after_seconds=tsdb_batching_timeout_secs,
+            key=mm_schemas.EventFieldType.ENDPOINT_ID,
+        )
+
     def write_application_event(
         self,
         event: dict,
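The tsdb_error target above batches writes: a flush happens when either max_events (10) or flush_after_seconds (60) is reached, whichever comes first. A minimal plain-Python sketch of that contract (the function name here is illustrative, not mlrun or storey API):

def should_flush(n_events: int, elapsed_secs: float,
                 max_events: int = 10, timeout_secs: int = 60) -> bool:
    # Flush when either the size or the time threshold is crossed.
    return n_events >= max_events or elapsed_secs >= timeout_secs

assert should_flush(10, 5.0)      # size-triggered flush
assert should_flush(3, 61.0)      # time-triggered flush
assert not should_flush(3, 5.0)   # keep batching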
@@ -277,7 +326,9 @@ class V3IOTSDBConnector(TSDBConnector):
         elif kind == mm_schemas.WriterEventKind.RESULT:
             table = self.tables[mm_schemas.V3IOTSDBTables.APP_RESULTS]
             index_cols = index_cols_base + [mm_schemas.ResultData.RESULT_NAME]
-
+            event.pop(mm_schemas.ResultData.CURRENT_STATS, None)
+            # TODO: remove this when extra data is supported (ML-7460)
+            event.pop(mm_schemas.ResultData.RESULT_EXTRA_DATA, None)
         else:
             raise ValueError(f"Invalid {kind = }")

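The two event.pop(..., None) calls above strip fields the TSDB table does not store; with a default argument, pop is a safe no-op when the key is absent. A small illustration with hypothetical field names:

event = {"result_name": "drift", "result_value": 0.2}
event.pop("current_stats", None)       # absent key: returns None, no KeyError
event.pop("result_extra_data", None)   # same, so the write proceeds unchanged
assert event == {"result_name": "drift", "result_value": 0.2}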
@@ -437,7 +488,7 @@ class V3IOTSDBConnector(TSDBConnector):
                 step=sliding_window_step,
                 **kwargs,
             )
-        except v3io_frames.ReadError as err:
+        except v3io_frames.Error as err:
             if _is_no_schema_error(err):
                 return pd.DataFrame()
             else:
@@ -504,10 +555,16 @@ class V3IOTSDBConnector(TSDBConnector):
         if type == "metrics":
             table_path = self.tables[mm_schemas.V3IOTSDBTables.METRICS]
             name = mm_schemas.MetricData.METRIC_NAME
+            columns = [mm_schemas.MetricData.METRIC_VALUE]
             df_handler = self.df_to_metrics_values
         elif type == "results":
             table_path = self.tables[mm_schemas.V3IOTSDBTables.APP_RESULTS]
             name = mm_schemas.ResultData.RESULT_NAME
+            columns = [
+                mm_schemas.ResultData.RESULT_VALUE,
+                mm_schemas.ResultData.RESULT_STATUS,
+                mm_schemas.ResultData.RESULT_KIND,
+            ]
             df_handler = self.df_to_results_values
         else:
             raise ValueError(f"Invalid {type = }")
@@ -517,6 +574,7 @@ class V3IOTSDBConnector(TSDBConnector):
             metric_and_app_names=[(metric.app, metric.name) for metric in metrics],
             table_path=table_path,
             name=name,
+            columns=columns,
         )

         logger.debug("Querying V3IO TSDB", query=query)
@@ -627,33 +685,153 @@ class V3IOTSDBConnector(TSDBConnector):
             ),  # pyright: ignore[reportArgumentType]
         )

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    def get_last_request(
+        self,
+        endpoint_ids: Union[str, list[str]],
+        start: Union[datetime, str] = "0",
+        end: Union[datetime, str] = "now",
+    ) -> pd.DataFrame:
+        endpoint_ids = (
+            endpoint_ids if isinstance(endpoint_ids, list) else [endpoint_ids]
+        )
+        df = self._get_records(
+            table=mm_schemas.FileTargetKind.PREDICTIONS,
+            start=start,
+            end=end,
+            filter_query=f"endpoint_id IN({str(endpoint_ids)[1:-1]})",
+            agg_funcs=["last"],
+        )
+        if not df.empty:
+            df.rename(
+                columns={
+                    f"last({mm_schemas.EventFieldType.LAST_REQUEST_TIMESTAMP})": mm_schemas.EventFieldType.LAST_REQUEST,
+                    f"last({mm_schemas.EventFieldType.LATENCY})": f"last_{mm_schemas.EventFieldType.LATENCY}",
+                },
+                inplace=True,
+            )
+            df[mm_schemas.EventFieldType.LAST_REQUEST] = df[
+                mm_schemas.EventFieldType.LAST_REQUEST
+            ].map(
+                lambda last_request: datetime.fromtimestamp(
+                    last_request, tz=timezone.utc
+                )
+            )
+
+        return df.reset_index(drop=True)
+
+    def get_drift_status(
+        self,
+        endpoint_ids: Union[str, list[str]],
+        start: Union[datetime, str] = "now-24h",
+        end: Union[datetime, str] = "now",
+    ) -> pd.DataFrame:
+        endpoint_ids = (
+            endpoint_ids if isinstance(endpoint_ids, list) else [endpoint_ids]
+        )
+        df = self._get_records(
+            table=mm_schemas.V3IOTSDBTables.APP_RESULTS,
+            start=start,
+            end=end,
+            columns=[mm_schemas.ResultData.RESULT_STATUS],
+            filter_query=f"endpoint_id IN({str(endpoint_ids)[1:-1]})",
+            agg_funcs=["max"],
+            group_by="endpoint_id",
+        )
+        if not df.empty:
+            df.columns = [
+                col[len("max(") : -1] if "max(" in col else col for col in df.columns
+            ]
+        return df.reset_index(drop=True)
+
+    def get_metrics_metadata(
+        self,
+        endpoint_id: str,
+        start: Union[datetime, str] = "0",
+        end: Union[datetime, str] = "now",
+    ) -> pd.DataFrame:
+        df = self._get_records(
+            table=mm_schemas.V3IOTSDBTables.METRICS,
+            start=start,
+            end=end,
+            columns=[mm_schemas.MetricData.METRIC_VALUE],
+            filter_query=f"endpoint_id=='{endpoint_id}'",
+            agg_funcs=["last"],
+        )
+        if not df.empty:
+            df.drop(
+                columns=[f"last({mm_schemas.MetricData.METRIC_VALUE})"], inplace=True
+            )
+        return df.reset_index(drop=True)
+
+    def get_results_metadata(
+        self,
+        endpoint_id: str,
+        start: Union[datetime, str] = "0",
+        end: Union[datetime, str] = "now",
+    ) -> pd.DataFrame:
+        df = self._get_records(
+            table=mm_schemas.V3IOTSDBTables.APP_RESULTS,
+            start=start,
+            end=end,
+            columns=[
+                mm_schemas.ResultData.RESULT_KIND,
+            ],
+            filter_query=f"endpoint_id=='{endpoint_id}'",
+            agg_funcs=["last"],
+        )
+        if not df.empty:
+            df.rename(
+                columns={
+                    f"last({mm_schemas.ResultData.RESULT_KIND})": mm_schemas.ResultData.RESULT_KIND
+                },
+                inplace=True,
+            )
+        return df.reset_index(drop=True)
+
+    def get_error_count(
+        self,
+        endpoint_ids: Union[str, list[str]],
+        start: Union[datetime, str] = "0",
+        end: Union[datetime, str] = "now",
+    ) -> pd.DataFrame:
+        endpoint_ids = (
+            endpoint_ids if isinstance(endpoint_ids, list) else [endpoint_ids]
+        )
+        df = self._get_records(
+            table=mm_schemas.FileTargetKind.ERRORS,
+            start=start,
+            end=end,
+            columns=[mm_schemas.EventFieldType.ERROR_COUNT],
+            filter_query=f"endpoint_id IN({str(endpoint_ids)[1:-1]})",
+            agg_funcs=["count"],
+        )
+        if not df.empty:
+            df.rename(
+                columns={
+                    f"count({mm_schemas.EventFieldType.ERROR_COUNT})": mm_schemas.EventFieldType.ERROR_COUNT
+                },
+                inplace=True,
+            )
+            df.dropna(inplace=True)
+        return df.reset_index(drop=True)
+
+    def get_avg_latency(
+        self,
+        endpoint_ids: Union[str, list[str]],
+        start: Union[datetime, str] = "0",
+        end: Union[datetime, str] = "now",
+    ) -> pd.DataFrame:
+        endpoint_ids = (
+            endpoint_ids if isinstance(endpoint_ids, list) else [endpoint_ids]
+        )
+        df = self._get_records(
+            table=mm_schemas.FileTargetKind.PREDICTIONS,
+            start=start,
+            end=end,
+            columns=[mm_schemas.EventFieldType.LATENCY],
+            filter_query=f"endpoint_id IN({str(endpoint_ids)[1:-1]})",
+            agg_funcs=["avg"],
+        )
+        if not df.empty:
+            df.dropna(inplace=True)
+        return df.reset_index(drop=True)
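Two idioms recur in the new read helpers above: the TSDB IN(...) filter string is built by slicing the repr of a Python list, and aggregated columns come back prefixed with the aggregation function (e.g. max(...)), which get_drift_status strips. Both shown in isolation, in plain Python:

endpoint_ids = ["ep-1", "ep-2"]
filter_query = f"endpoint_id IN({str(endpoint_ids)[1:-1]})"
assert filter_query == "endpoint_id IN('ep-1', 'ep-2')"  # repr minus the brackets

col = "max(result_status)"
assert (col[len("max(") : -1] if "max(" in col else col) == "result_status"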
mlrun/model_monitoring/helpers.py
CHANGED

@@ -20,10 +20,8 @@ import pandas as pd

 import mlrun
 import mlrun.common.model_monitoring.helpers
-import mlrun.common.schemas
-from mlrun.common.schemas.model_monitoring.constants import (
-    EventFieldType,
-)
+import mlrun.common.schemas.model_monitoring.constants as mm_constants
+import mlrun.data_types.infer
 from mlrun.common.schemas.model_monitoring.model_endpoints import (
     ModelEndpointMonitoringMetric,
     ModelEndpointMonitoringMetricType,
@@ -35,7 +33,6 @@ from mlrun.utils import logger
 if typing.TYPE_CHECKING:
     from mlrun.db.base import RunDBInterface
     from mlrun.projects import MlrunProject
-    import mlrun.common.schemas.model_monitoring.constants as mm_constants


 class _BatchDict(typing.TypedDict):
@@ -45,26 +42,29 @@ class _BatchDict(typing.TypedDict):


 def get_stream_path(
-    project: str,
+    project: str,
+    function_name: str = mm_constants.MonitoringFunctionNames.STREAM,
+    stream_uri: typing.Optional[str] = None,
 ) -> str:
     """
     Get stream path from the project secret. If wasn't set, take it from the system configurations

     :param project:       Project name.
-    :param function_name:
+    :param function_name: Application name. Default is model_monitoring_stream.
+    :param stream_uri:    Stream URI. If provided, it will be used instead of the one from the project secret.

     :return: Monitoring stream path to the relevant application.
     """

-    stream_uri = mlrun.get_secret_or_env(
-        mm_constants.ProjectSecretKeys.STREAM_PATH
-    )
+    stream_uri = stream_uri or mlrun.get_secret_or_env(
+        mm_constants.ProjectSecretKeys.STREAM_PATH
+    )

     if not stream_uri or stream_uri == "v3io":
         # TODO : remove the first part of this condition in 1.9.0
         stream_uri = mlrun.mlconf.get_model_monitoring_file_target_path(
             project=project,
-            kind=
+            kind=mm_constants.FileTargetKind.STREAM,
             target="online",
             function_name=function_name,
         )
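A minimal usage sketch of the new stream_uri override, assuming a project named "my-project" and a hypothetical custom URI; an explicit stream_uri now takes precedence over the project-secret lookup:

from mlrun.model_monitoring.helpers import get_stream_path

default_path = get_stream_path(project="my-project")  # secret/system config, as before
custom_path = get_stream_path(
    project="my-project",
    stream_uri="kafka://broker:9092?topic=monitoring",  # hypothetical URI, bypasses the secret
)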
@@ -78,7 +78,7 @@ def get_stream_path(

 def get_monitoring_parquet_path(
     project: "MlrunProject",
-    kind: str =
+    kind: str = mm_constants.FileTargetKind.PARQUET,
 ) -> str:
     """Get model monitoring parquet target for the current project and kind. The parquet target path is based on the
     project artifact path. If project artifact path is not defined, the parquet target path will be based on MLRun
@@ -111,7 +111,7 @@ def get_connection_string(secret_provider: typing.Callable[[str], str] = None) -
     """

     return mlrun.get_secret_or_env(
-        key=
+        key=mm_constants.ProjectSecretKeys.ENDPOINT_STORE_CONNECTION,
         secret_provider=secret_provider,
     )

@@ -126,7 +126,7 @@ def get_tsdb_connection_string(
     """

     return mlrun.get_secret_or_env(
-        key=
+        key=mm_constants.ProjectSecretKeys.TSDB_CONNECTION,
         secret_provider=secret_provider,
     )

@@ -200,7 +200,7 @@ def update_model_endpoint_last_request(
         db.patch_model_endpoint(
             project=project,
             endpoint_id=model_endpoint.metadata.uid,
-            attributes={EventFieldType.LAST_REQUEST: current_request},
+            attributes={mm_constants.EventFieldType.LAST_REQUEST: current_request},
         )
     else:
         try:
@@ -229,7 +229,7 @@ def update_model_endpoint_last_request(
         db.patch_model_endpoint(
             project=project,
             endpoint_id=model_endpoint.metadata.uid,
-            attributes={EventFieldType.LAST_REQUEST: bumped_last_request},
+            attributes={mm_constants.EventFieldType.LAST_REQUEST: bumped_last_request},
         )


@@ -249,8 +249,7 @@ def calculate_inputs_statistics(

     # Use `DFDataInfer` to calculate the statistics over the inputs:
     inputs_statistics = mlrun.data_types.infer.DFDataInfer.get_stats(
-        df=inputs,
-        options=mlrun.data_types.infer.InferOptions.Histogram,
+        df=inputs, options=mlrun.data_types.infer.InferOptions.Histogram
     )

     # Recalculate the histograms over the bins that are set in the sample-set of the end point:
mlrun/model_monitoring/stream_processing.py
CHANGED

@@ -169,11 +169,40 @@ class EventStreamProcessor:
             mlrun.serving.states.RootFlowStep,
             fn.set_topology(mlrun.serving.states.StepKinds.flow),
         )
+        graph.add_step(
+            "ExtractEndpointID",
+            "extract_endpoint",
+            full_event=True,
+        )
+
+        # split the graph between event with error vs valid event
+        graph.add_step(
+            "storey.Filter",
+            "FilterError",
+            after="extract_endpoint",
+            _fn="(event.get('error') is None)",
+        )
+
+        graph.add_step(
+            "storey.Filter",
+            "ForwardError",
+            after="extract_endpoint",
+            _fn="(event.get('error') is not None)",
+        )
+
+        tsdb_connector = mlrun.model_monitoring.get_tsdb_connector(
+            project=self.project, secret_provider=secret_provider
+        )
+
+        tsdb_connector.handle_model_error(
+            graph,
+        )

         # Process endpoint event: splitting into sub-events and validate event data
         def apply_process_endpoint_event():
             graph.add_step(
                 "ProcessEndpointEvent",
+                after="extract_endpoint",  # TODO: change this to FilterError in ML-7456
                 full_event=True,
                 project=self.project,
             )
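The two storey.Filter predicates above are complements, so each event takes exactly one branch of the graph. Evaluated by hand on a sample event dict:

event = {"endpoint_id": "ep-1", "error": "division by zero"}
takes_valid_branch = event.get("error") is None       # FilterError predicate
takes_error_branch = event.get("error") is not None   # ForwardError predicate
assert (takes_valid_branch, takes_error_branch) == (False, True)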
@@ -295,9 +324,6 @@ class EventStreamProcessor:

         apply_storey_sample_window()

-        tsdb_connector = mlrun.model_monitoring.get_tsdb_connector(
-            project=self.project, secret_provider=secret_provider
-        )
         tsdb_connector.apply_monitoring_stream_steps(graph=graph)

         # Parquet branch
@@ -386,6 +412,38 @@ class ProcessBeforeEndpointUpdate(mlrun.feature_store.steps.MapClass):
         return e


+class ExtractEndpointID(mlrun.feature_store.steps.MapClass):
+    def __init__(self, **kwargs) -> None:
+        """
+        Generate the model endpoint ID based on the event parameters and attach it to the event.
+        """
+        super().__init__(**kwargs)
+
+    def do(self, full_event) -> typing.Union[storey.Event, None]:
+        # Getting model version and function uri from event
+        # and use them for retrieving the endpoint_id
+        function_uri = full_event.body.get(EventFieldType.FUNCTION_URI)
+        if not is_not_none(function_uri, [EventFieldType.FUNCTION_URI]):
+            return None
+
+        model = full_event.body.get(EventFieldType.MODEL)
+        if not is_not_none(model, [EventFieldType.MODEL]):
+            return None
+
+        version = full_event.body.get(EventFieldType.VERSION)
+        versioned_model = f"{model}:{version}" if version else f"{model}:latest"
+
+        endpoint_id = mlrun.common.model_monitoring.create_model_endpoint_uid(
+            function_uri=function_uri,
+            versioned_model=versioned_model,
+        )
+
+        endpoint_id = str(endpoint_id)
+        full_event.body[EventFieldType.ENDPOINT_ID] = endpoint_id
+        full_event.body[EventFieldType.VERSIONED_MODEL] = versioned_model
+        return full_event
+
+
 class ProcessBeforeParquet(mlrun.feature_store.steps.MapClass):
     def __init__(self, **kwargs):
         """
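The versioned-model string that ExtractEndpointID passes to create_model_endpoint_uid falls back to a latest tag when the event carries no version (the model names below are hypothetical):

model, version = "churn-model", None
assert (f"{model}:{version}" if version else f"{model}:latest") == "churn-model:latest"

model, version = "churn-model", "2"
assert (f"{model}:{version}" if version else f"{model}:latest") == "churn-model:2"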
@@ -459,28 +517,9 @@ class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
     def do(self, full_event):
         event = full_event.body

-        # Getting model version and function uri from event
-        # and use them for retrieving the endpoint_id
-        function_uri = event.get(EventFieldType.FUNCTION_URI)
-        if not is_not_none(function_uri, [EventFieldType.FUNCTION_URI]):
-            return None
-
-        model = event.get(EventFieldType.MODEL)
-        if not is_not_none(model, [EventFieldType.MODEL]):
-            return None
-
-        version = event.get(EventFieldType.VERSION)
-        versioned_model = f"{model}:{version}" if version else f"{model}:latest"
-
-        endpoint_id = mlrun.common.model_monitoring.create_model_endpoint_uid(
-            function_uri=function_uri,
-            versioned_model=versioned_model,
-        )
-
-        endpoint_id = str(endpoint_id)
-
-        event[EventFieldType.VERSIONED_MODEL] = versioned_model
-        event[EventFieldType.ENDPOINT_ID] = endpoint_id
+        versioned_model = event[EventFieldType.VERSIONED_MODEL]
+        endpoint_id = event[EventFieldType.ENDPOINT_ID]
+        function_uri = event[EventFieldType.FUNCTION_URI]

         # In case this process fails, resume state from existing record
         self.resume_state(endpoint_id)
@@ -488,9 +527,8 @@ class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
         # If error key has been found in the current event,
         # increase the error counter by 1 and raise the error description
         error = event.get("error")
-        if error:
+        if error:  # TODO: delete this in ML-7456
             self.error_count[endpoint_id] += 1
-            # TODO: write to tsdb / kv once in a while
             raise mlrun.errors.MLRunInvalidArgumentError(str(error))

         # Validate event fields
@@ -598,6 +636,9 @@ class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
             EventFieldType.PREDICTION: prediction,
             EventFieldType.FIRST_REQUEST: self.first_request[endpoint_id],
             EventFieldType.LAST_REQUEST: self.last_request[endpoint_id],
+            EventFieldType.LAST_REQUEST_TIMESTAMP: mlrun.utils.enrich_datetime_with_tz_info(
+                self.last_request[endpoint_id]
+            ).timestamp(),
             EventFieldType.ERROR_COUNT: self.error_count[endpoint_id],
             EventFieldType.LABELS: event.get(EventFieldType.LABELS, {}),
             EventFieldType.METRICS: event.get(EventFieldType.METRICS, {}),
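The new LAST_REQUEST_TIMESTAMP field stores the last request as an epoch float; enrich_datetime_with_tz_info (added to mlrun/utils/helpers.py in this release) presumably attaches timezone info before .timestamp() is taken, and get_last_request reverses the conversion with datetime.fromtimestamp(..., tz=timezone.utc). A sketch of the round trip, assuming naive UTC inputs:

from datetime import datetime, timezone

last_request = datetime(2024, 7, 1, 12, 0)                   # naive, assumed UTC
ts = last_request.replace(tzinfo=timezone.utc).timestamp()   # write path: epoch float
restored = datetime.fromtimestamp(ts, tz=timezone.utc)       # read path (get_last_request)
assert restored == last_request.replace(tzinfo=timezone.utc)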
mlrun/projects/operations.py
CHANGED

@@ -189,7 +189,7 @@ def run_function(
     if engine == "kfp":
         if schedule:
             raise mlrun.errors.MLRunInvalidArgumentError(
-                "Scheduling
+                "Scheduling jobs is not supported when running a workflow with the kfp engine."
             )
         return function.as_step(
             name=name, runspec=task, workdir=workdir, outputs=outputs, labels=labels