mlrun-1.8.0rc4-py3-none-any.whl → mlrun-1.8.0rc6-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mlrun might be problematic. Click here for more details.
- mlrun/__init__.py +4 -3
- mlrun/alerts/alert.py +129 -2
- mlrun/artifacts/__init__.py +1 -1
- mlrun/artifacts/base.py +12 -1
- mlrun/artifacts/document.py +59 -38
- mlrun/common/model_monitoring/__init__.py +0 -2
- mlrun/common/model_monitoring/helpers.py +0 -28
- mlrun/common/schemas/__init__.py +1 -4
- mlrun/common/schemas/alert.py +3 -0
- mlrun/common/schemas/artifact.py +4 -0
- mlrun/common/schemas/client_spec.py +0 -1
- mlrun/common/schemas/model_monitoring/__init__.py +0 -6
- mlrun/common/schemas/model_monitoring/constants.py +11 -9
- mlrun/common/schemas/model_monitoring/model_endpoints.py +77 -149
- mlrun/common/schemas/notification.py +6 -0
- mlrun/config.py +0 -2
- mlrun/datastore/datastore_profile.py +57 -17
- mlrun/datastore/vectorstore.py +67 -59
- mlrun/db/base.py +22 -18
- mlrun/db/factory.py +0 -3
- mlrun/db/httpdb.py +122 -150
- mlrun/db/nopdb.py +33 -17
- mlrun/execution.py +43 -29
- mlrun/model.py +7 -0
- mlrun/model_monitoring/__init__.py +3 -2
- mlrun/model_monitoring/api.py +40 -43
- mlrun/model_monitoring/applications/_application_steps.py +4 -2
- mlrun/model_monitoring/applications/base.py +65 -6
- mlrun/model_monitoring/applications/context.py +64 -33
- mlrun/model_monitoring/applications/evidently_base.py +0 -1
- mlrun/model_monitoring/applications/histogram_data_drift.py +2 -6
- mlrun/model_monitoring/controller.py +43 -37
- mlrun/model_monitoring/db/__init__.py +0 -2
- mlrun/model_monitoring/db/tsdb/base.py +2 -1
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +2 -1
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +43 -0
- mlrun/model_monitoring/helpers.py +12 -66
- mlrun/model_monitoring/stream_processing.py +83 -270
- mlrun/model_monitoring/writer.py +1 -10
- mlrun/projects/project.py +87 -74
- mlrun/runtimes/nuclio/function.py +7 -6
- mlrun/runtimes/nuclio/serving.py +7 -1
- mlrun/serving/routers.py +158 -145
- mlrun/serving/server.py +6 -0
- mlrun/serving/states.py +2 -0
- mlrun/serving/v2_serving.py +69 -60
- mlrun/utils/helpers.py +14 -30
- mlrun/utils/notifications/notification/mail.py +36 -9
- mlrun/utils/notifications/notification_pusher.py +34 -13
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.8.0rc4.dist-info → mlrun-1.8.0rc6.dist-info}/METADATA +5 -4
- {mlrun-1.8.0rc4.dist-info → mlrun-1.8.0rc6.dist-info}/RECORD +56 -69
- mlrun/common/schemas/model_monitoring/model_endpoint_v2.py +0 -149
- mlrun/model_monitoring/db/stores/__init__.py +0 -136
- mlrun/model_monitoring/db/stores/base/__init__.py +0 -15
- mlrun/model_monitoring/db/stores/base/store.py +0 -154
- mlrun/model_monitoring/db/stores/sqldb/__init__.py +0 -13
- mlrun/model_monitoring/db/stores/sqldb/models/__init__.py +0 -46
- mlrun/model_monitoring/db/stores/sqldb/models/base.py +0 -93
- mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +0 -47
- mlrun/model_monitoring/db/stores/sqldb/models/sqlite.py +0 -25
- mlrun/model_monitoring/db/stores/sqldb/sql_store.py +0 -408
- mlrun/model_monitoring/db/stores/v3io_kv/__init__.py +0 -13
- mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +0 -464
- mlrun/model_monitoring/model_endpoint.py +0 -120
- {mlrun-1.8.0rc4.dist-info → mlrun-1.8.0rc6.dist-info}/LICENSE +0 -0
- {mlrun-1.8.0rc4.dist-info → mlrun-1.8.0rc6.dist-info}/WHEEL +0 -0
- {mlrun-1.8.0rc4.dist-info → mlrun-1.8.0rc6.dist-info}/entry_points.txt +0 -0
- {mlrun-1.8.0rc4.dist-info → mlrun-1.8.0rc6.dist-info}/top_level.txt +0 -0
|
@@ -12,26 +12,36 @@
|
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
|
|
15
|
-
import json
|
|
16
15
|
import socket
|
|
17
|
-
from typing import Any, Optional, cast
|
|
16
|
+
from typing import Any, Optional, Protocol, cast
|
|
18
17
|
|
|
18
|
+
import nuclio.request
|
|
19
19
|
import numpy as np
|
|
20
20
|
import pandas as pd
|
|
21
21
|
|
|
22
22
|
import mlrun.common.constants as mlrun_constants
|
|
23
23
|
import mlrun.common.schemas.model_monitoring.constants as mm_constants
|
|
24
|
+
import mlrun.errors
|
|
24
25
|
import mlrun.feature_store as fstore
|
|
25
26
|
import mlrun.features
|
|
26
27
|
import mlrun.serving
|
|
27
28
|
import mlrun.utils
|
|
28
29
|
from mlrun.artifacts import Artifact, DatasetArtifact, ModelArtifact, get_model
|
|
29
|
-
from mlrun.common.model_monitoring.helpers import FeatureStats
|
|
30
|
+
from mlrun.common.model_monitoring.helpers import FeatureStats
|
|
31
|
+
from mlrun.common.schemas import ModelEndpoint
|
|
30
32
|
from mlrun.model_monitoring.helpers import (
|
|
31
33
|
calculate_inputs_statistics,
|
|
32
|
-
get_endpoint_record,
|
|
33
34
|
)
|
|
34
|
-
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class _ArtifactsLogger(Protocol):
|
|
38
|
+
"""
|
|
39
|
+
Classes that implement this protocol are :code:`MlrunProject` and :code:`MLClientCtx`.
|
|
40
|
+
"""
|
|
41
|
+
|
|
42
|
+
def log_artifact(self, *args, **kwargs) -> Artifact: ...
|
|
43
|
+
def log_dataset(self, *args, **kwargs) -> DatasetArtifact: ...
|
|
44
|
+
def log_model(self, *args, **kwargs) -> ModelArtifact: ...
|
|
35
45
|
|
|
36
46
|
|
|
37
47
|
class MonitoringApplicationContext:
|
|
@@ -52,6 +62,7 @@ class MonitoringApplicationContext:
|
|
|
52
62
|
:param end_infer_time: (pd.Timestamp) End time of the monitoring schedule.
|
|
53
63
|
:param latest_request: (pd.Timestamp) Timestamp of the latest request on this endpoint_id.
|
|
54
64
|
:param endpoint_id: (str) ID of the monitored model endpoint
|
|
65
|
+
:param endpoint_name: (str) Name of the monitored model endpoint
|
|
55
66
|
:param output_stream_uri: (str) URI of the output stream for results
|
|
56
67
|
:param model_endpoint: (ModelEndpoint) The model endpoint object.
|
|
57
68
|
:param feature_names: (list[str]) List of models feature names.
|
|
@@ -60,36 +71,57 @@ class MonitoringApplicationContext:
|
|
|
60
71
|
and a list of extra data items.
|
|
61
72
|
"""
|
|
62
73
|
|
|
74
|
+
_logger_name = "monitoring-application"
|
|
75
|
+
|
|
63
76
|
def __init__(
|
|
64
77
|
self,
|
|
65
78
|
*,
|
|
66
|
-
graph_context: mlrun.serving.GraphContext,
|
|
67
79
|
application_name: str,
|
|
68
80
|
event: dict[str, Any],
|
|
69
|
-
model_endpoint_dict: dict[str, ModelEndpoint],
|
|
81
|
+
model_endpoint_dict: Optional[dict[str, ModelEndpoint]] = None,
|
|
82
|
+
logger: Optional[mlrun.utils.Logger] = None,
|
|
83
|
+
graph_context: Optional[mlrun.serving.GraphContext] = None,
|
|
84
|
+
artifacts_logger: Optional[_ArtifactsLogger] = None,
|
|
70
85
|
) -> None:
|
|
71
86
|
"""
|
|
72
|
-
Initialize a
|
|
87
|
+
Initialize a :code:`MonitoringApplicationContext` object.
|
|
73
88
|
Note: this object should not be instantiated manually.
|
|
74
89
|
|
|
75
90
|
:param application_name: The application name.
|
|
76
91
|
:param event: The instance data dictionary.
|
|
77
|
-
:param model_endpoint_dict:
|
|
92
|
+
:param model_endpoint_dict: Optional - dictionary of model endpoints.
|
|
93
|
+
:param logger: Optional - MLRun logger instance.
|
|
94
|
+
:param graph_context: Optional - GraphContext instance.
|
|
95
|
+
:param artifacts_logger: Optional - an object that can log artifacts,
|
|
96
|
+
typically :py:class:`~mlrun.projects.MlrunProject` or
|
|
97
|
+
:py:class:`~mlrun.execution.MLClientCtx`.
|
|
78
98
|
"""
|
|
79
99
|
self.application_name = application_name
|
|
80
100
|
|
|
81
|
-
|
|
82
|
-
|
|
101
|
+
if graph_context:
|
|
102
|
+
self.project_name = graph_context.project
|
|
103
|
+
self.project = mlrun.load_project(url=self.project_name)
|
|
104
|
+
else:
|
|
105
|
+
self.project = cast("mlrun.MlrunProject", mlrun.get_current_project())
|
|
106
|
+
self.project_name = self.project.name
|
|
107
|
+
|
|
108
|
+
self._artifacts_logger: _ArtifactsLogger = artifacts_logger or self.project
|
|
83
109
|
|
|
84
110
|
# MLRun Logger
|
|
85
|
-
self.logger = mlrun.utils.create_logger(
|
|
111
|
+
self.logger = logger or mlrun.utils.create_logger(
|
|
86
112
|
level=mlrun.mlconf.log_level,
|
|
87
113
|
formatter_kind=mlrun.mlconf.log_formatter,
|
|
88
|
-
name=
|
|
114
|
+
name=self._logger_name,
|
|
89
115
|
)
|
|
90
116
|
# Nuclio logger - `nuclio.request.Logger`.
|
|
91
|
-
# Note: this logger
|
|
92
|
-
self.nuclio_logger =
|
|
117
|
+
# Note: this logger accepts keyword arguments only in its `_with` methods, e.g. `info_with`.
|
|
118
|
+
self.nuclio_logger = (
|
|
119
|
+
graph_context.logger
|
|
120
|
+
if graph_context
|
|
121
|
+
else nuclio.request.Logger(
|
|
122
|
+
level=mlrun.mlconf.log_level, name=self._logger_name
|
|
123
|
+
)
|
|
124
|
+
)
|
|
93
125
|
|
|
94
126
|
# event data
|
|
95
127
|
self.start_infer_time = pd.Timestamp(
|
|
@@ -101,6 +133,9 @@ class MonitoringApplicationContext:
|
|
|
101
133
|
self.endpoint_id = cast(
|
|
102
134
|
str, event.get(mm_constants.ApplicationEvent.ENDPOINT_ID)
|
|
103
135
|
)
|
|
136
|
+
self.endpoint_name = cast(
|
|
137
|
+
str, event.get(mm_constants.ApplicationEvent.ENDPOINT_NAME)
|
|
138
|
+
)
|
|
104
139
|
self.output_stream_uri = cast(
|
|
105
140
|
str, event.get(mm_constants.ApplicationEvent.OUTPUT_STREAM_URI)
|
|
106
141
|
)
|
|
@@ -113,8 +148,8 @@ class MonitoringApplicationContext:
|
|
|
113
148
|
|
|
114
149
|
# Persistent data - fetched when needed
|
|
115
150
|
self._sample_df: Optional[pd.DataFrame] = None
|
|
116
|
-
self._model_endpoint: Optional[ModelEndpoint] =
|
|
117
|
-
self.endpoint_id
|
|
151
|
+
self._model_endpoint: Optional[ModelEndpoint] = (
|
|
152
|
+
model_endpoint_dict.get(self.endpoint_id) if model_endpoint_dict else None
|
|
118
153
|
)
|
|
119
154
|
|
|
120
155
|
def _get_default_labels(self) -> dict[str, str]:
|
|
@@ -133,7 +168,7 @@ class MonitoringApplicationContext:
|
|
|
133
168
|
def sample_df(self) -> pd.DataFrame:
|
|
134
169
|
if self._sample_df is None:
|
|
135
170
|
feature_set = fstore.get_feature_set(
|
|
136
|
-
self.model_endpoint.
|
|
171
|
+
self.model_endpoint.spec.monitoring_feature_set_uri
|
|
137
172
|
)
|
|
138
173
|
features = [f"{feature_set.metadata.name}.*"]
|
|
139
174
|
vector = fstore.FeatureVector(
|
|
@@ -155,16 +190,18 @@ class MonitoringApplicationContext:
|
|
|
155
190
|
@property
|
|
156
191
|
def model_endpoint(self) -> ModelEndpoint:
|
|
157
192
|
if not self._model_endpoint:
|
|
158
|
-
self._model_endpoint =
|
|
159
|
-
|
|
193
|
+
self._model_endpoint = mlrun.db.get_run_db().get_model_endpoint(
|
|
194
|
+
name=self.endpoint_name,
|
|
195
|
+
project=self.project_name,
|
|
196
|
+
endpoint_id=self.endpoint_id,
|
|
197
|
+
feature_analysis=True,
|
|
160
198
|
)
|
|
161
199
|
return self._model_endpoint
|
|
162
200
|
|
|
163
201
|
@property
|
|
164
202
|
def feature_stats(self) -> FeatureStats:
|
|
165
203
|
if not self._feature_stats:
|
|
166
|
-
self._feature_stats =
|
|
167
|
-
pad_features_hist(self._feature_stats)
|
|
204
|
+
self._feature_stats = self.model_endpoint.spec.feature_stats
|
|
168
205
|
return self._feature_stats
|
|
169
206
|
|
|
170
207
|
@property
|
|
@@ -179,18 +216,12 @@ class MonitoringApplicationContext:
|
|
|
179
216
|
@property
|
|
180
217
|
def feature_names(self) -> list[str]:
|
|
181
218
|
"""The feature names of the model"""
|
|
182
|
-
|
|
183
|
-
return (
|
|
184
|
-
feature_names
|
|
185
|
-
if isinstance(feature_names, list)
|
|
186
|
-
else json.loads(feature_names)
|
|
187
|
-
)
|
|
219
|
+
return self.model_endpoint.spec.feature_names
|
|
188
220
|
|
|
189
221
|
@property
|
|
190
222
|
def label_names(self) -> list[str]:
|
|
191
223
|
"""The label names of the model"""
|
|
192
|
-
|
|
193
|
-
return label_names if isinstance(label_names, list) else json.loads(label_names)
|
|
224
|
+
return self.model_endpoint.spec.label_names
|
|
194
225
|
|
|
195
226
|
@property
|
|
196
227
|
def model(self) -> tuple[str, ModelArtifact, dict]:
|
|
@@ -237,7 +268,7 @@ class MonitoringApplicationContext:
|
|
|
237
268
|
See :func:`~mlrun.projects.MlrunProject.log_artifact` for the documentation.
|
|
238
269
|
"""
|
|
239
270
|
labels = self._add_default_labels(labels)
|
|
240
|
-
return self.
|
|
271
|
+
return self._artifacts_logger.log_artifact(
|
|
241
272
|
item,
|
|
242
273
|
body=body,
|
|
243
274
|
tag=tag,
|
|
@@ -272,7 +303,7 @@ class MonitoringApplicationContext:
|
|
|
272
303
|
See :func:`~mlrun.projects.MlrunProject.log_dataset` for the documentation.
|
|
273
304
|
"""
|
|
274
305
|
labels = self._add_default_labels(labels)
|
|
275
|
-
return self.
|
|
306
|
+
return self._artifacts_logger.log_dataset(
|
|
276
307
|
key,
|
|
277
308
|
df,
|
|
278
309
|
tag=tag,
|
|
@@ -317,7 +348,7 @@ class MonitoringApplicationContext:
|
|
|
317
348
|
See :func:`~mlrun.projects.MlrunProject.log_model` for the documentation.
|
|
318
349
|
"""
|
|
319
350
|
labels = self._add_default_labels(labels)
|
|
320
|
-
return self.
|
|
351
|
+
return self._artifacts_logger.log_model(
|
|
321
352
|
key,
|
|
322
353
|
body=body,
|
|
323
354
|
framework=framework,
|
|
@@ -76,7 +76,6 @@ class EvidentlyModelMonitoringApplicationBase(
|
|
|
76
76
|
|
|
77
77
|
:param evidently_workspace_path: (str) The path to the Evidently workspace.
|
|
78
78
|
:param evidently_project_id: (str) The ID of the Evidently project.
|
|
79
|
-
|
|
80
79
|
"""
|
|
81
80
|
|
|
82
81
|
# TODO : more than one project (mep -> project)
|
|
@@ -113,7 +113,7 @@ class HistogramDataDriftApplication(ModelMonitoringApplicationBase):
|
|
|
113
113
|
|
|
114
114
|
project.enable_model_monitoring()
|
|
115
115
|
|
|
116
|
-
To avoid it, pass
|
|
116
|
+
To avoid it, pass :code:`deploy_histogram_data_drift_app=False`.
|
|
117
117
|
"""
|
|
118
118
|
|
|
119
119
|
NAME: Final[str] = HistogramDataDriftApplicationConstants.NAME
|
|
@@ -331,8 +331,7 @@ class HistogramDataDriftApplication(ModelMonitoringApplicationBase):
|
|
|
331
331
|
)
|
|
332
332
|
|
|
333
333
|
def do_tracking(
|
|
334
|
-
self,
|
|
335
|
-
monitoring_context: mm_context.MonitoringApplicationContext,
|
|
334
|
+
self, monitoring_context: mm_context.MonitoringApplicationContext
|
|
336
335
|
) -> list[
|
|
337
336
|
Union[
|
|
338
337
|
mm_results.ModelMonitoringApplicationResult,
|
|
@@ -342,9 +341,6 @@ class HistogramDataDriftApplication(ModelMonitoringApplicationBase):
|
|
|
342
341
|
]:
|
|
343
342
|
"""
|
|
344
343
|
Calculate and return the data drift metrics, averaged over the features.
|
|
345
|
-
|
|
346
|
-
Refer to `ModelMonitoringApplicationBaseV2` for the meaning of the
|
|
347
|
-
function arguments.
|
|
348
344
|
"""
|
|
349
345
|
monitoring_context.logger.debug("Starting to run the application")
|
|
350
346
|
if not monitoring_context.feature_stats:
|
|
@@ -19,7 +19,7 @@ import os
|
|
|
19
19
|
from collections.abc import Iterator
|
|
20
20
|
from contextlib import AbstractContextManager
|
|
21
21
|
from types import TracebackType
|
|
22
|
-
from typing import
|
|
22
|
+
from typing import NamedTuple, Optional, cast
|
|
23
23
|
|
|
24
24
|
import nuclio_sdk
|
|
25
25
|
|
|
@@ -27,6 +27,7 @@ import mlrun
|
|
|
27
27
|
import mlrun.common.schemas.model_monitoring.constants as mm_constants
|
|
28
28
|
import mlrun.feature_store as fstore
|
|
29
29
|
import mlrun.model_monitoring
|
|
30
|
+
from mlrun.common.schemas import EndpointType
|
|
30
31
|
from mlrun.datastore import get_stream_pusher
|
|
31
32
|
from mlrun.errors import err_to_str
|
|
32
33
|
from mlrun.model_monitoring.db._schedules import ModelMonitoringSchedulesFile
|
|
@@ -65,7 +66,7 @@ class _BatchWindow:
|
|
|
65
66
|
self._start = self._get_last_analyzed()
|
|
66
67
|
|
|
67
68
|
def _get_saved_last_analyzed(self) -> Optional[int]:
|
|
68
|
-
return self._db.get_application_time(self._application)
|
|
69
|
+
return cast(int, self._db.get_application_time(self._application))
|
|
69
70
|
|
|
70
71
|
def _update_last_analyzed(self, last_analyzed: int) -> None:
|
|
71
72
|
self._db.update_application_time(
|
|
@@ -161,18 +162,20 @@ class _BatchWindowGenerator(AbstractContextManager):
|
|
|
161
162
|
)
|
|
162
163
|
|
|
163
164
|
@classmethod
|
|
164
|
-
def _get_last_updated_time(
|
|
165
|
+
def _get_last_updated_time(
|
|
166
|
+
cls, last_request: datetime.datetime, not_batch_endpoint: bool
|
|
167
|
+
) -> int:
|
|
165
168
|
"""
|
|
166
169
|
Get the last updated time of a model endpoint.
|
|
167
170
|
"""
|
|
168
171
|
last_updated = int(
|
|
169
|
-
|
|
172
|
+
last_request.timestamp()
|
|
170
173
|
- cast(
|
|
171
174
|
float,
|
|
172
175
|
mlrun.mlconf.model_endpoint_monitoring.parquet_batching_timeout_secs,
|
|
173
176
|
)
|
|
174
177
|
)
|
|
175
|
-
if not
|
|
178
|
+
if not not_batch_endpoint:
|
|
176
179
|
# If the endpoint does not have a stream, `last_updated` should be
|
|
177
180
|
# the minimum between the current time and the last updated time.
|
|
178
181
|
# This compensates for the bumping mechanism - see
|
|
@@ -183,17 +186,13 @@ class _BatchWindowGenerator(AbstractContextManager):
|
|
|
183
186
|
)
|
|
184
187
|
return last_updated
|
|
185
188
|
|
|
186
|
-
@staticmethod
|
|
187
|
-
def _date_string2timestamp(date_string: str) -> int:
|
|
188
|
-
return int(datetime.datetime.fromisoformat(date_string).timestamp())
|
|
189
|
-
|
|
190
189
|
def get_intervals(
|
|
191
190
|
self,
|
|
192
191
|
*,
|
|
193
192
|
application: str,
|
|
194
|
-
first_request:
|
|
195
|
-
last_request:
|
|
196
|
-
|
|
193
|
+
first_request: datetime.datetime,
|
|
194
|
+
last_request: datetime.datetime,
|
|
195
|
+
not_batch_endpoint: bool,
|
|
197
196
|
) -> Iterator[_Interval]:
|
|
198
197
|
"""
|
|
199
198
|
Get the batch window for a specific endpoint and application.
|
|
@@ -204,8 +203,8 @@ class _BatchWindowGenerator(AbstractContextManager):
|
|
|
204
203
|
schedules_file=self._schedules_file,
|
|
205
204
|
application=application,
|
|
206
205
|
timedelta_seconds=self._timedelta,
|
|
207
|
-
last_updated=self._get_last_updated_time(last_request,
|
|
208
|
-
first_request=
|
|
206
|
+
last_updated=self._get_last_updated_time(last_request, not_batch_endpoint),
|
|
207
|
+
first_request=int(first_request.timestamp()),
|
|
209
208
|
)
|
|
210
209
|
yield from batch_window.get_intervals()
|
|
211
210
|
|
|
@@ -235,8 +234,6 @@ class MonitoringApplicationController:
|
|
|
235
234
|
|
|
236
235
|
logger.debug(f"Initializing {self.__class__.__name__}", project=self.project)
|
|
237
236
|
|
|
238
|
-
self.db = mlrun.model_monitoring.get_store_object(project=self.project)
|
|
239
|
-
|
|
240
237
|
self._window_length = _get_window_length()
|
|
241
238
|
|
|
242
239
|
self.model_monitoring_access_key = self._get_model_monitoring_access_key()
|
|
@@ -253,19 +250,16 @@ class MonitoringApplicationController:
|
|
|
253
250
|
return access_key
|
|
254
251
|
|
|
255
252
|
@staticmethod
|
|
256
|
-
def _should_monitor_endpoint(endpoint:
|
|
253
|
+
def _should_monitor_endpoint(endpoint: mlrun.common.schemas.ModelEndpoint) -> bool:
|
|
257
254
|
return (
|
|
258
|
-
# Is the model endpoint active?
|
|
259
|
-
endpoint[mm_constants.EventFieldType.ACTIVE]
|
|
260
255
|
# Is the model endpoint monitored?
|
|
261
|
-
|
|
262
|
-
== mm_constants.ModelMonitoringMode.enabled
|
|
256
|
+
endpoint.status.monitoring_mode == mm_constants.ModelMonitoringMode.enabled
|
|
263
257
|
# Was the model endpoint called? I.e., are the first and last requests nonempty?
|
|
264
|
-
and endpoint
|
|
265
|
-
and endpoint
|
|
258
|
+
and endpoint.status.first_request
|
|
259
|
+
and endpoint.status.last_request
|
|
266
260
|
# Is the model endpoint not a router endpoint? Router endpoint has no feature stats
|
|
267
|
-
and
|
|
268
|
-
!= mm_constants.EndpointType.ROUTER
|
|
261
|
+
and endpoint.metadata.endpoint_type.value
|
|
262
|
+
!= mm_constants.EndpointType.ROUTER.value
|
|
269
263
|
)
|
|
270
264
|
|
|
271
265
|
def run(self) -> None:
|
|
@@ -281,7 +275,10 @@ class MonitoringApplicationController:
|
|
|
281
275
|
logger.info("Start running monitoring controller")
|
|
282
276
|
try:
|
|
283
277
|
applications_names = []
|
|
284
|
-
|
|
278
|
+
endpoints_list = mlrun.db.get_run_db().list_model_endpoints(
|
|
279
|
+
project=self.project, tsdb_metrics=True
|
|
280
|
+
)
|
|
281
|
+
endpoints = endpoints_list.endpoints
|
|
285
282
|
if not endpoints:
|
|
286
283
|
logger.info("No model endpoints found", project=self.project)
|
|
287
284
|
return
|
|
@@ -333,12 +330,19 @@ class MonitoringApplicationController:
|
|
|
333
330
|
model_monitoring_access_key=self.model_monitoring_access_key,
|
|
334
331
|
storage_options=self.storage_options,
|
|
335
332
|
)
|
|
333
|
+
else:
|
|
334
|
+
logger.debug(
|
|
335
|
+
"Skipping endpoint, not ready or not suitable for monitoring",
|
|
336
|
+
endpoint_id=endpoint.metadata.uid,
|
|
337
|
+
endpoint_name=endpoint.metadata.name,
|
|
338
|
+
)
|
|
339
|
+
logger.info("Finished running monitoring controller")
|
|
336
340
|
|
|
337
341
|
@classmethod
|
|
338
342
|
def model_endpoint_process(
|
|
339
343
|
cls,
|
|
340
344
|
project: str,
|
|
341
|
-
endpoint:
|
|
345
|
+
endpoint: mlrun.common.schemas.ModelEndpoint,
|
|
342
346
|
applications_names: list[str],
|
|
343
347
|
window_length: int,
|
|
344
348
|
model_monitoring_access_key: str,
|
|
@@ -356,11 +360,11 @@ class MonitoringApplicationController:
|
|
|
356
360
|
:param model_monitoring_access_key: (str) Access key to apply the model monitoring process.
|
|
357
361
|
:param storage_options: (dict) Storage options for reading the infer parquet files.
|
|
358
362
|
"""
|
|
359
|
-
endpoint_id = endpoint
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
endpoint[mm_constants.EventFieldType.FEATURE_SET_URI]
|
|
363
|
+
endpoint_id = endpoint.metadata.uid
|
|
364
|
+
not_batch_endpoint = not (
|
|
365
|
+
endpoint.metadata.endpoint_type == EndpointType.BATCH_EP
|
|
363
366
|
)
|
|
367
|
+
m_fs = fstore.get_feature_set(endpoint.spec.monitoring_feature_set_uri)
|
|
364
368
|
try:
|
|
365
369
|
with _BatchWindowGenerator(
|
|
366
370
|
project=project, endpoint_id=endpoint_id, window_length=window_length
|
|
@@ -371,11 +375,9 @@ class MonitoringApplicationController:
|
|
|
371
375
|
end_infer_time,
|
|
372
376
|
) in batch_window_generator.get_intervals(
|
|
373
377
|
application=application,
|
|
374
|
-
first_request=endpoint
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
last_request=endpoint[mm_constants.EventFieldType.LAST_REQUEST],
|
|
378
|
-
has_stream=has_stream,
|
|
378
|
+
first_request=endpoint.status.first_request,
|
|
379
|
+
last_request=endpoint.status.last_request,
|
|
380
|
+
not_batch_endpoint=not_batch_endpoint,
|
|
379
381
|
):
|
|
380
382
|
df = m_fs.to_dataframe(
|
|
381
383
|
start_time=start_infer_time,
|
|
@@ -401,15 +403,17 @@ class MonitoringApplicationController:
|
|
|
401
403
|
start_infer_time=start_infer_time,
|
|
402
404
|
end_infer_time=end_infer_time,
|
|
403
405
|
endpoint_id=endpoint_id,
|
|
406
|
+
endpoint_name=endpoint.metadata.name,
|
|
404
407
|
project=project,
|
|
405
408
|
applications_names=[application],
|
|
406
409
|
model_monitoring_access_key=model_monitoring_access_key,
|
|
407
410
|
)
|
|
411
|
+
logger.info("Finished processing endpoint", endpoint_id=endpoint_id)
|
|
408
412
|
|
|
409
413
|
except Exception:
|
|
410
414
|
logger.exception(
|
|
411
415
|
"Encountered an exception",
|
|
412
|
-
endpoint_id=endpoint
|
|
416
|
+
endpoint_id=endpoint.metadata.uid,
|
|
413
417
|
)
|
|
414
418
|
|
|
415
419
|
@staticmethod
|
|
@@ -417,6 +421,7 @@ class MonitoringApplicationController:
|
|
|
417
421
|
start_infer_time: datetime.datetime,
|
|
418
422
|
end_infer_time: datetime.datetime,
|
|
419
423
|
endpoint_id: str,
|
|
424
|
+
endpoint_name: str,
|
|
420
425
|
project: str,
|
|
421
426
|
applications_names: list[str],
|
|
422
427
|
model_monitoring_access_key: str,
|
|
@@ -440,6 +445,7 @@ class MonitoringApplicationController:
|
|
|
440
445
|
sep=" ", timespec="microseconds"
|
|
441
446
|
),
|
|
442
447
|
mm_constants.ApplicationEvent.ENDPOINT_ID: endpoint_id,
|
|
448
|
+
mm_constants.ApplicationEvent.ENDPOINT_NAME: endpoint_name,
|
|
443
449
|
mm_constants.ApplicationEvent.OUTPUT_STREAM_URI: get_stream_path(
|
|
444
450
|
project=project,
|
|
445
451
|
function_name=mm_constants.MonitoringFunctionNames.WRITER,
|
|
@@ -12,7 +12,5 @@
|
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
|
|
15
|
-
from .stores import ObjectStoreFactory, get_store_object
|
|
16
|
-
from .stores.base import StoreBase
|
|
17
15
|
from .tsdb import get_tsdb_connector
|
|
18
16
|
from .tsdb.base import TSDBConnector
|
|
@@ -47,7 +47,7 @@ class TSDBConnector(ABC):
|
|
|
47
47
|
self.project = project
|
|
48
48
|
|
|
49
49
|
@abstractmethod
|
|
50
|
-
def apply_monitoring_stream_steps(self, graph) -> None:
|
|
50
|
+
def apply_monitoring_stream_steps(self, graph, **kwargs) -> None:
|
|
51
51
|
"""
|
|
52
52
|
Apply TSDB steps on the provided monitoring graph. Throughout these steps, the graph stores live data of
|
|
53
53
|
different key metric dictionaries. This data is being used by the monitoring dashboards in
|
|
@@ -294,6 +294,7 @@ class TSDBConnector(ABC):
|
|
|
294
294
|
) -> pd.DataFrame:
|
|
295
295
|
"""
|
|
296
296
|
Fetches data from the predictions TSDB table and returns the average latency for each specified endpoint
|
|
297
|
+
in the provided time range, which by default is the last 24 hours.
|
|
297
298
|
|
|
298
299
|
:param endpoint_ids: A list of model endpoint identifiers.
|
|
299
300
|
:param start: The start time for the query.
|
|
@@ -164,7 +164,7 @@ class TDEngineConnector(TSDBConnector):
|
|
|
164
164
|
def _convert_to_datetime(val: typing.Union[str, datetime]) -> datetime:
|
|
165
165
|
return datetime.fromisoformat(val) if isinstance(val, str) else val
|
|
166
166
|
|
|
167
|
-
def apply_monitoring_stream_steps(self, graph):
|
|
167
|
+
def apply_monitoring_stream_steps(self, graph, **kwarg):
|
|
168
168
|
"""
|
|
169
169
|
Apply TSDB steps on the provided monitoring graph. Throughout these steps, the graph stores live data of
|
|
170
170
|
different key metric dictionaries. This data is being used by the monitoring dashboards in
|
|
@@ -701,6 +701,7 @@ class TDEngineConnector(TSDBConnector):
|
|
|
701
701
|
endpoint_ids = (
|
|
702
702
|
endpoint_ids if isinstance(endpoint_ids, list) else [endpoint_ids]
|
|
703
703
|
)
|
|
704
|
+
start = start or (mlrun.utils.datetime_now() - timedelta(hours=24))
|
|
704
705
|
start, end = self._get_start_end(start, end)
|
|
705
706
|
df = self._get_records(
|
|
706
707
|
table=self.tables[mm_schemas.TDEngineSuperTables.PREDICTIONS].super_table,
|
|
@@ -168,6 +168,9 @@ class V3IOTSDBConnector(TSDBConnector):
|
|
|
168
168
|
tsdb_batching_max_events: int = 1000,
|
|
169
169
|
tsdb_batching_timeout_secs: int = 30,
|
|
170
170
|
sample_window: int = 10,
|
|
171
|
+
aggregate_windows: Optional[list[str]] = None,
|
|
172
|
+
aggregate_period: str = "1m",
|
|
173
|
+
**kwarg,
|
|
171
174
|
):
|
|
172
175
|
"""
|
|
173
176
|
Apply TSDB steps on the provided monitoring graph. Throughout these steps, the graph stores live data of
|
|
@@ -178,7 +181,40 @@ class V3IOTSDBConnector(TSDBConnector):
|
|
|
178
181
|
- endpoint_features (Prediction and feature names and values)
|
|
179
182
|
- custom_metrics (user-defined metrics)
|
|
180
183
|
"""
|
|
184
|
+
aggregate_windows = aggregate_windows or ["5m", "1h"]
|
|
181
185
|
|
|
186
|
+
# Calculate number of predictions and average latency
|
|
187
|
+
def apply_storey_aggregations():
|
|
188
|
+
# Calculate number of predictions for each window (5 min and 1 hour by default)
|
|
189
|
+
graph.add_step(
|
|
190
|
+
class_name="storey.AggregateByKey",
|
|
191
|
+
aggregates=[
|
|
192
|
+
{
|
|
193
|
+
"name": EventFieldType.LATENCY,
|
|
194
|
+
"column": EventFieldType.LATENCY,
|
|
195
|
+
"operations": ["count", "avg"],
|
|
196
|
+
"windows": aggregate_windows,
|
|
197
|
+
"period": aggregate_period,
|
|
198
|
+
}
|
|
199
|
+
],
|
|
200
|
+
name=EventFieldType.LATENCY,
|
|
201
|
+
after="MapFeatureNames",
|
|
202
|
+
step_name="Aggregates",
|
|
203
|
+
table=".",
|
|
204
|
+
key_field=EventFieldType.ENDPOINT_ID,
|
|
205
|
+
)
|
|
206
|
+
# Calculate average latency time for each window (5 min and 1 hour by default)
|
|
207
|
+
graph.add_step(
|
|
208
|
+
class_name="storey.Rename",
|
|
209
|
+
mapping={
|
|
210
|
+
"latency_count_5m": mm_schemas.EventLiveStats.PREDICTIONS_COUNT_5M,
|
|
211
|
+
"latency_count_1h": mm_schemas.EventLiveStats.PREDICTIONS_COUNT_1H,
|
|
212
|
+
},
|
|
213
|
+
name="Rename",
|
|
214
|
+
after=EventFieldType.LATENCY,
|
|
215
|
+
)
|
|
216
|
+
|
|
217
|
+
apply_storey_aggregations()
|
|
182
218
|
# Write latency per prediction, labeled by endpoint ID only
|
|
183
219
|
graph.add_step(
|
|
184
220
|
"storey.TSDBTarget",
|
|
@@ -853,6 +889,7 @@ class V3IOTSDBConnector(TSDBConnector):
|
|
|
853
889
|
endpoint_ids = (
|
|
854
890
|
endpoint_ids if isinstance(endpoint_ids, list) else [endpoint_ids]
|
|
855
891
|
)
|
|
892
|
+
start = start or (mlrun.utils.datetime_now() - timedelta(hours=24))
|
|
856
893
|
start, end = self._get_start_end(start, end)
|
|
857
894
|
df = self._get_records(
|
|
858
895
|
table=mm_schemas.FileTargetKind.PREDICTIONS,
|
|
@@ -864,4 +901,10 @@ class V3IOTSDBConnector(TSDBConnector):
|
|
|
864
901
|
)
|
|
865
902
|
if not df.empty:
|
|
866
903
|
df.dropna(inplace=True)
|
|
904
|
+
df.rename(
|
|
905
|
+
columns={
|
|
906
|
+
f"avg({mm_schemas.EventFieldType.LATENCY})": f"avg_{mm_schemas.EventFieldType.LATENCY}"
|
|
907
|
+
},
|
|
908
|
+
inplace=True,
|
|
909
|
+
)
|
|
867
910
|
return df.reset_index(drop=True)
|