mlrun 1.8.0rc5__py3-none-any.whl → 1.8.0rc6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mlrun might be problematic. Click here for more details.
- mlrun/artifacts/__init__.py +1 -1
- mlrun/artifacts/base.py +12 -1
- mlrun/artifacts/document.py +59 -38
- mlrun/common/model_monitoring/__init__.py +0 -2
- mlrun/common/model_monitoring/helpers.py +0 -28
- mlrun/common/schemas/__init__.py +1 -4
- mlrun/common/schemas/client_spec.py +0 -1
- mlrun/common/schemas/model_monitoring/__init__.py +0 -6
- mlrun/common/schemas/model_monitoring/constants.py +11 -9
- mlrun/common/schemas/model_monitoring/model_endpoints.py +77 -149
- mlrun/common/schemas/notification.py +6 -0
- mlrun/config.py +0 -2
- mlrun/datastore/datastore_profile.py +57 -17
- mlrun/datastore/vectorstore.py +67 -59
- mlrun/db/base.py +22 -18
- mlrun/db/httpdb.py +116 -148
- mlrun/db/nopdb.py +33 -17
- mlrun/execution.py +11 -4
- mlrun/model.py +3 -0
- mlrun/model_monitoring/__init__.py +3 -2
- mlrun/model_monitoring/api.py +40 -43
- mlrun/model_monitoring/applications/_application_steps.py +3 -1
- mlrun/model_monitoring/applications/context.py +15 -17
- mlrun/model_monitoring/controller.py +43 -37
- mlrun/model_monitoring/db/__init__.py +0 -2
- mlrun/model_monitoring/db/tsdb/base.py +2 -1
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +2 -1
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +43 -0
- mlrun/model_monitoring/helpers.py +12 -66
- mlrun/model_monitoring/stream_processing.py +83 -270
- mlrun/model_monitoring/writer.py +1 -10
- mlrun/projects/project.py +63 -55
- mlrun/runtimes/nuclio/function.py +7 -6
- mlrun/runtimes/nuclio/serving.py +7 -1
- mlrun/serving/routers.py +158 -145
- mlrun/serving/server.py +6 -0
- mlrun/serving/states.py +2 -0
- mlrun/serving/v2_serving.py +69 -60
- mlrun/utils/helpers.py +14 -30
- mlrun/utils/notifications/notification/mail.py +17 -6
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.8.0rc5.dist-info → mlrun-1.8.0rc6.dist-info}/METADATA +1 -1
- {mlrun-1.8.0rc5.dist-info → mlrun-1.8.0rc6.dist-info}/RECORD +47 -60
- mlrun/common/schemas/model_monitoring/model_endpoint_v2.py +0 -149
- mlrun/model_monitoring/db/stores/__init__.py +0 -136
- mlrun/model_monitoring/db/stores/base/__init__.py +0 -15
- mlrun/model_monitoring/db/stores/base/store.py +0 -154
- mlrun/model_monitoring/db/stores/sqldb/__init__.py +0 -13
- mlrun/model_monitoring/db/stores/sqldb/models/__init__.py +0 -46
- mlrun/model_monitoring/db/stores/sqldb/models/base.py +0 -93
- mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +0 -47
- mlrun/model_monitoring/db/stores/sqldb/models/sqlite.py +0 -25
- mlrun/model_monitoring/db/stores/sqldb/sql_store.py +0 -408
- mlrun/model_monitoring/db/stores/v3io_kv/__init__.py +0 -13
- mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +0 -464
- mlrun/model_monitoring/model_endpoint.py +0 -120
- {mlrun-1.8.0rc5.dist-info → mlrun-1.8.0rc6.dist-info}/LICENSE +0 -0
- {mlrun-1.8.0rc5.dist-info → mlrun-1.8.0rc6.dist-info}/WHEEL +0 -0
- {mlrun-1.8.0rc5.dist-info → mlrun-1.8.0rc6.dist-info}/entry_points.txt +0 -0
- {mlrun-1.8.0rc5.dist-info → mlrun-1.8.0rc6.dist-info}/top_level.txt +0 -0
mlrun/model_monitoring/api.py
CHANGED
|
@@ -26,11 +26,14 @@ import mlrun.common.schemas.model_monitoring.constants as mm_constants
|
|
|
26
26
|
import mlrun.feature_store
|
|
27
27
|
import mlrun.model_monitoring.applications as mm_app
|
|
28
28
|
import mlrun.serving
|
|
29
|
+
from mlrun.common.schemas import ModelEndpoint
|
|
30
|
+
from mlrun.common.schemas.model_monitoring import (
|
|
31
|
+
FunctionURI,
|
|
32
|
+
)
|
|
29
33
|
from mlrun.data_types.infer import InferOptions, get_df_stats
|
|
30
34
|
from mlrun.utils import datetime_now, logger
|
|
31
35
|
|
|
32
36
|
from .helpers import update_model_endpoint_last_request
|
|
33
|
-
from .model_endpoint import ModelEndpoint
|
|
34
37
|
|
|
35
38
|
# A union of all supported dataset types:
|
|
36
39
|
DatasetType = typing.Union[
|
|
@@ -46,8 +49,6 @@ def get_or_create_model_endpoint(
|
|
|
46
49
|
function_name: str = "",
|
|
47
50
|
context: mlrun.MLClientCtx = None,
|
|
48
51
|
sample_set_statistics: typing.Optional[dict[str, typing.Any]] = None,
|
|
49
|
-
drift_threshold: typing.Optional[float] = None,
|
|
50
|
-
possible_drift_threshold: typing.Optional[float] = None,
|
|
51
52
|
monitoring_mode: mm_constants.ModelMonitoringMode = mm_constants.ModelMonitoringMode.disabled,
|
|
52
53
|
db_session=None,
|
|
53
54
|
) -> ModelEndpoint:
|
|
@@ -68,10 +69,6 @@ def get_or_create_model_endpoint(
|
|
|
68
69
|
full function hash.
|
|
69
70
|
:param sample_set_statistics: Dictionary of sample set statistics that will be used as a reference data for
|
|
70
71
|
the new model endpoint (applicable only to new endpoint_id).
|
|
71
|
-
:param drift_threshold: (deprecated) The threshold of which to mark drifts (applicable only to new
|
|
72
|
-
endpoint_id).
|
|
73
|
-
:param possible_drift_threshold: (deprecated) The threshold of which to mark possible drifts (applicable only to new
|
|
74
|
-
endpoint_id).
|
|
75
72
|
:param monitoring_mode: If enabled, apply model monitoring features on the provided endpoint id
|
|
76
73
|
(applicable only to new endpoint_id).
|
|
77
74
|
:param db_session: A runtime session that manages the current dialog with the database.
|
|
@@ -79,18 +76,15 @@ def get_or_create_model_endpoint(
|
|
|
79
76
|
:return: A ModelEndpoint object
|
|
80
77
|
"""
|
|
81
78
|
|
|
82
|
-
if not endpoint_id:
|
|
83
|
-
# Generate a new model endpoint id based on the project name and model name
|
|
84
|
-
endpoint_id = hashlib.sha1(
|
|
85
|
-
f"{project}_{model_endpoint_name}".encode()
|
|
86
|
-
).hexdigest()
|
|
87
|
-
|
|
88
79
|
if not db_session:
|
|
89
80
|
# Generate a runtime database
|
|
90
81
|
db_session = mlrun.get_run_db()
|
|
91
82
|
try:
|
|
92
83
|
model_endpoint = db_session.get_model_endpoint(
|
|
93
|
-
project=project,
|
|
84
|
+
project=project,
|
|
85
|
+
name=model_endpoint_name,
|
|
86
|
+
endpoint_id=endpoint_id,
|
|
87
|
+
function_name=function_name,
|
|
94
88
|
)
|
|
95
89
|
# If other fields are provided, validate that they correspond to the existing model endpoint data
|
|
96
90
|
_model_endpoint_validations(
|
|
@@ -104,7 +98,6 @@ def get_or_create_model_endpoint(
|
|
|
104
98
|
model_endpoint = _generate_model_endpoint(
|
|
105
99
|
project=project,
|
|
106
100
|
db_session=db_session,
|
|
107
|
-
endpoint_id=endpoint_id,
|
|
108
101
|
model_path=model_path,
|
|
109
102
|
model_endpoint_name=model_endpoint_name,
|
|
110
103
|
function_name=function_name,
|
|
@@ -208,13 +201,13 @@ def record_results(
|
|
|
208
201
|
monitoring_mode=monitoring_mode,
|
|
209
202
|
db_session=db,
|
|
210
203
|
)
|
|
211
|
-
logger.debug("Model endpoint", endpoint=model_endpoint
|
|
204
|
+
logger.debug("Model endpoint", endpoint=model_endpoint)
|
|
212
205
|
|
|
213
206
|
timestamp = datetime_now()
|
|
214
207
|
if infer_results_df is not None:
|
|
215
208
|
# Write the monitoring parquet to the relevant model endpoint context
|
|
216
209
|
write_monitoring_df(
|
|
217
|
-
feature_set_uri=model_endpoint.
|
|
210
|
+
feature_set_uri=model_endpoint.spec.monitoring_feature_set_uri,
|
|
218
211
|
infer_datetime=timestamp,
|
|
219
212
|
endpoint_id=model_endpoint.metadata.uid,
|
|
220
213
|
infer_results_df=infer_results_df,
|
|
@@ -278,7 +271,7 @@ def _model_endpoint_validations(
|
|
|
278
271
|
# Feature stats
|
|
279
272
|
if (
|
|
280
273
|
sample_set_statistics
|
|
281
|
-
and sample_set_statistics != model_endpoint.
|
|
274
|
+
and sample_set_statistics != model_endpoint.spec.feature_stats
|
|
282
275
|
):
|
|
283
276
|
logger.warning(
|
|
284
277
|
"Provided sample set statistics is different from the registered statistics. "
|
|
@@ -330,7 +323,6 @@ def write_monitoring_df(
|
|
|
330
323
|
def _generate_model_endpoint(
|
|
331
324
|
project: str,
|
|
332
325
|
db_session,
|
|
333
|
-
endpoint_id: str,
|
|
334
326
|
model_path: str,
|
|
335
327
|
model_endpoint_name: str,
|
|
336
328
|
function_name: str,
|
|
@@ -344,7 +336,6 @@ def _generate_model_endpoint(
|
|
|
344
336
|
:param project: Project name.
|
|
345
337
|
|
|
346
338
|
:param db_session: A session that manages the current dialog with the database.
|
|
347
|
-
:param endpoint_id: Model endpoint unique ID.
|
|
348
339
|
:param model_path: The model Store path.
|
|
349
340
|
:param model_endpoint_name: Model endpoint name will be presented under the new model endpoint.
|
|
350
341
|
:param function_name: If a new model endpoint is created, use this function name for generating the
|
|
@@ -357,32 +348,38 @@ def _generate_model_endpoint(
|
|
|
357
348
|
|
|
358
349
|
:return: `mlrun.common.schemas.ModelEndpoint` object.
|
|
359
350
|
"""
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
351
|
+
if not function_name and context:
|
|
352
|
+
function_name = FunctionURI.from_string(
|
|
353
|
+
context.to_dict()["spec"]["function"]
|
|
354
|
+
).function
|
|
355
|
+
model_obj = None
|
|
356
|
+
if model_path:
|
|
357
|
+
model_obj: mlrun.artifacts.ModelArtifact = (
|
|
358
|
+
mlrun.datastore.store_resources.get_store_resource(
|
|
359
|
+
model_path, db=db_session
|
|
360
|
+
)
|
|
368
361
|
)
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
362
|
+
current_time = datetime_now()
|
|
363
|
+
model_endpoint = mlrun.common.schemas.ModelEndpoint(
|
|
364
|
+
metadata=mlrun.common.schemas.ModelEndpointMetadata(
|
|
365
|
+
project=project,
|
|
366
|
+
name=model_endpoint_name,
|
|
367
|
+
endpoint_type=mlrun.common.schemas.model_monitoring.EndpointType.BATCH_EP,
|
|
368
|
+
),
|
|
369
|
+
spec=mlrun.common.schemas.ModelEndpointSpec(
|
|
370
|
+
function_name=function_name,
|
|
371
|
+
model_name=model_obj.metadata.key if model_path else None,
|
|
372
|
+
model_uid=model_obj.metadata.uid if model_path else None,
|
|
373
|
+
model_class="drift-analysis",
|
|
374
|
+
),
|
|
375
|
+
status=mlrun.common.schemas.ModelEndpointStatus(
|
|
376
|
+
monitoring_mode=monitoring_mode,
|
|
377
|
+
first_request=current_time,
|
|
378
|
+
last_request=current_time,
|
|
379
|
+
),
|
|
383
380
|
)
|
|
384
381
|
|
|
385
|
-
return db_session.
|
|
382
|
+
return db_session.create_model_endpoint(model_endpoint=model_endpoint)
|
|
386
383
|
|
|
387
384
|
|
|
388
385
|
def get_sample_set_statistics(
|
|
@@ -16,6 +16,7 @@ import json
|
|
|
16
16
|
import traceback
|
|
17
17
|
from typing import Any, Optional, Union
|
|
18
18
|
|
|
19
|
+
import mlrun.common.schemas
|
|
19
20
|
import mlrun.common.schemas.alert as alert_objects
|
|
20
21
|
import mlrun.common.schemas.model_monitoring.constants as mm_constant
|
|
21
22
|
import mlrun.datastore
|
|
@@ -81,6 +82,7 @@ class _PushToMonitoringWriter(StepToDict):
|
|
|
81
82
|
self._lazy_init()
|
|
82
83
|
application_results, application_context = event
|
|
83
84
|
writer_event = {
|
|
85
|
+
mm_constant.WriterEvent.ENDPOINT_NAME: application_context.endpoint_name,
|
|
84
86
|
mm_constant.WriterEvent.APPLICATION_NAME: application_context.application_name,
|
|
85
87
|
mm_constant.WriterEvent.ENDPOINT_ID: application_context.endpoint_id,
|
|
86
88
|
mm_constant.WriterEvent.START_INFER_TIME: application_context.start_infer_time.isoformat(
|
|
@@ -125,7 +127,7 @@ class _PrepareMonitoringEvent(StepToDict):
|
|
|
125
127
|
"""
|
|
126
128
|
self.graph_context = context
|
|
127
129
|
self.application_name = application_name
|
|
128
|
-
self.model_endpoints: dict[str, mlrun.
|
|
130
|
+
self.model_endpoints: dict[str, mlrun.common.schemas.ModelEndpoint] = {}
|
|
129
131
|
|
|
130
132
|
def do(self, event: dict[str, Any]) -> MonitoringApplicationContext:
|
|
131
133
|
"""
|
|
@@ -12,7 +12,6 @@
|
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
|
|
15
|
-
import json
|
|
16
15
|
import socket
|
|
17
16
|
from typing import Any, Optional, Protocol, cast
|
|
18
17
|
|
|
@@ -28,12 +27,11 @@ import mlrun.features
|
|
|
28
27
|
import mlrun.serving
|
|
29
28
|
import mlrun.utils
|
|
30
29
|
from mlrun.artifacts import Artifact, DatasetArtifact, ModelArtifact, get_model
|
|
31
|
-
from mlrun.common.model_monitoring.helpers import FeatureStats
|
|
30
|
+
from mlrun.common.model_monitoring.helpers import FeatureStats
|
|
31
|
+
from mlrun.common.schemas import ModelEndpoint
|
|
32
32
|
from mlrun.model_monitoring.helpers import (
|
|
33
33
|
calculate_inputs_statistics,
|
|
34
|
-
get_endpoint_record,
|
|
35
34
|
)
|
|
36
|
-
from mlrun.model_monitoring.model_endpoint import ModelEndpoint
|
|
37
35
|
|
|
38
36
|
|
|
39
37
|
class _ArtifactsLogger(Protocol):
|
|
@@ -64,6 +62,7 @@ class MonitoringApplicationContext:
|
|
|
64
62
|
:param end_infer_time: (pd.Timestamp) End time of the monitoring schedule.
|
|
65
63
|
:param latest_request: (pd.Timestamp) Timestamp of the latest request on this endpoint_id.
|
|
66
64
|
:param endpoint_id: (str) ID of the monitored model endpoint
|
|
65
|
+
:param endpoint_name: (str) Name of the monitored model endpoint
|
|
67
66
|
:param output_stream_uri: (str) URI of the output stream for results
|
|
68
67
|
:param model_endpoint: (ModelEndpoint) The model endpoint object.
|
|
69
68
|
:param feature_names: (list[str]) List of models feature names.
|
|
@@ -134,6 +133,9 @@ class MonitoringApplicationContext:
|
|
|
134
133
|
self.endpoint_id = cast(
|
|
135
134
|
str, event.get(mm_constants.ApplicationEvent.ENDPOINT_ID)
|
|
136
135
|
)
|
|
136
|
+
self.endpoint_name = cast(
|
|
137
|
+
str, event.get(mm_constants.ApplicationEvent.ENDPOINT_NAME)
|
|
138
|
+
)
|
|
137
139
|
self.output_stream_uri = cast(
|
|
138
140
|
str, event.get(mm_constants.ApplicationEvent.OUTPUT_STREAM_URI)
|
|
139
141
|
)
|
|
@@ -166,7 +168,7 @@ class MonitoringApplicationContext:
|
|
|
166
168
|
def sample_df(self) -> pd.DataFrame:
|
|
167
169
|
if self._sample_df is None:
|
|
168
170
|
feature_set = fstore.get_feature_set(
|
|
169
|
-
self.model_endpoint.
|
|
171
|
+
self.model_endpoint.spec.monitoring_feature_set_uri
|
|
170
172
|
)
|
|
171
173
|
features = [f"{feature_set.metadata.name}.*"]
|
|
172
174
|
vector = fstore.FeatureVector(
|
|
@@ -188,16 +190,18 @@ class MonitoringApplicationContext:
|
|
|
188
190
|
@property
|
|
189
191
|
def model_endpoint(self) -> ModelEndpoint:
|
|
190
192
|
if not self._model_endpoint:
|
|
191
|
-
self._model_endpoint =
|
|
192
|
-
|
|
193
|
+
self._model_endpoint = mlrun.db.get_run_db().get_model_endpoint(
|
|
194
|
+
name=self.endpoint_name,
|
|
195
|
+
project=self.project_name,
|
|
196
|
+
endpoint_id=self.endpoint_id,
|
|
197
|
+
feature_analysis=True,
|
|
193
198
|
)
|
|
194
199
|
return self._model_endpoint
|
|
195
200
|
|
|
196
201
|
@property
|
|
197
202
|
def feature_stats(self) -> FeatureStats:
|
|
198
203
|
if not self._feature_stats:
|
|
199
|
-
self._feature_stats =
|
|
200
|
-
pad_features_hist(self._feature_stats)
|
|
204
|
+
self._feature_stats = self.model_endpoint.spec.feature_stats
|
|
201
205
|
return self._feature_stats
|
|
202
206
|
|
|
203
207
|
@property
|
|
@@ -212,18 +216,12 @@ class MonitoringApplicationContext:
|
|
|
212
216
|
@property
|
|
213
217
|
def feature_names(self) -> list[str]:
|
|
214
218
|
"""The feature names of the model"""
|
|
215
|
-
|
|
216
|
-
return (
|
|
217
|
-
feature_names
|
|
218
|
-
if isinstance(feature_names, list)
|
|
219
|
-
else json.loads(feature_names)
|
|
220
|
-
)
|
|
219
|
+
return self.model_endpoint.spec.feature_names
|
|
221
220
|
|
|
222
221
|
@property
|
|
223
222
|
def label_names(self) -> list[str]:
|
|
224
223
|
"""The label names of the model"""
|
|
225
|
-
|
|
226
|
-
return label_names if isinstance(label_names, list) else json.loads(label_names)
|
|
224
|
+
return self.model_endpoint.spec.label_names
|
|
227
225
|
|
|
228
226
|
@property
|
|
229
227
|
def model(self) -> tuple[str, ModelArtifact, dict]:
|
|
@@ -19,7 +19,7 @@ import os
|
|
|
19
19
|
from collections.abc import Iterator
|
|
20
20
|
from contextlib import AbstractContextManager
|
|
21
21
|
from types import TracebackType
|
|
22
|
-
from typing import
|
|
22
|
+
from typing import NamedTuple, Optional, cast
|
|
23
23
|
|
|
24
24
|
import nuclio_sdk
|
|
25
25
|
|
|
@@ -27,6 +27,7 @@ import mlrun
|
|
|
27
27
|
import mlrun.common.schemas.model_monitoring.constants as mm_constants
|
|
28
28
|
import mlrun.feature_store as fstore
|
|
29
29
|
import mlrun.model_monitoring
|
|
30
|
+
from mlrun.common.schemas import EndpointType
|
|
30
31
|
from mlrun.datastore import get_stream_pusher
|
|
31
32
|
from mlrun.errors import err_to_str
|
|
32
33
|
from mlrun.model_monitoring.db._schedules import ModelMonitoringSchedulesFile
|
|
@@ -65,7 +66,7 @@ class _BatchWindow:
|
|
|
65
66
|
self._start = self._get_last_analyzed()
|
|
66
67
|
|
|
67
68
|
def _get_saved_last_analyzed(self) -> Optional[int]:
|
|
68
|
-
return self._db.get_application_time(self._application)
|
|
69
|
+
return cast(int, self._db.get_application_time(self._application))
|
|
69
70
|
|
|
70
71
|
def _update_last_analyzed(self, last_analyzed: int) -> None:
|
|
71
72
|
self._db.update_application_time(
|
|
@@ -161,18 +162,20 @@ class _BatchWindowGenerator(AbstractContextManager):
|
|
|
161
162
|
)
|
|
162
163
|
|
|
163
164
|
@classmethod
|
|
164
|
-
def _get_last_updated_time(
|
|
165
|
+
def _get_last_updated_time(
|
|
166
|
+
cls, last_request: datetime.datetime, not_batch_endpoint: bool
|
|
167
|
+
) -> int:
|
|
165
168
|
"""
|
|
166
169
|
Get the last updated time of a model endpoint.
|
|
167
170
|
"""
|
|
168
171
|
last_updated = int(
|
|
169
|
-
|
|
172
|
+
last_request.timestamp()
|
|
170
173
|
- cast(
|
|
171
174
|
float,
|
|
172
175
|
mlrun.mlconf.model_endpoint_monitoring.parquet_batching_timeout_secs,
|
|
173
176
|
)
|
|
174
177
|
)
|
|
175
|
-
if not
|
|
178
|
+
if not not_batch_endpoint:
|
|
176
179
|
# If the endpoint does not have a stream, `last_updated` should be
|
|
177
180
|
# the minimum between the current time and the last updated time.
|
|
178
181
|
# This compensates for the bumping mechanism - see
|
|
@@ -183,17 +186,13 @@ class _BatchWindowGenerator(AbstractContextManager):
|
|
|
183
186
|
)
|
|
184
187
|
return last_updated
|
|
185
188
|
|
|
186
|
-
@staticmethod
|
|
187
|
-
def _date_string2timestamp(date_string: str) -> int:
|
|
188
|
-
return int(datetime.datetime.fromisoformat(date_string).timestamp())
|
|
189
|
-
|
|
190
189
|
def get_intervals(
|
|
191
190
|
self,
|
|
192
191
|
*,
|
|
193
192
|
application: str,
|
|
194
|
-
first_request:
|
|
195
|
-
last_request:
|
|
196
|
-
|
|
193
|
+
first_request: datetime.datetime,
|
|
194
|
+
last_request: datetime.datetime,
|
|
195
|
+
not_batch_endpoint: bool,
|
|
197
196
|
) -> Iterator[_Interval]:
|
|
198
197
|
"""
|
|
199
198
|
Get the batch window for a specific endpoint and application.
|
|
@@ -204,8 +203,8 @@ class _BatchWindowGenerator(AbstractContextManager):
|
|
|
204
203
|
schedules_file=self._schedules_file,
|
|
205
204
|
application=application,
|
|
206
205
|
timedelta_seconds=self._timedelta,
|
|
207
|
-
last_updated=self._get_last_updated_time(last_request,
|
|
208
|
-
first_request=
|
|
206
|
+
last_updated=self._get_last_updated_time(last_request, not_batch_endpoint),
|
|
207
|
+
first_request=int(first_request.timestamp()),
|
|
209
208
|
)
|
|
210
209
|
yield from batch_window.get_intervals()
|
|
211
210
|
|
|
@@ -235,8 +234,6 @@ class MonitoringApplicationController:
|
|
|
235
234
|
|
|
236
235
|
logger.debug(f"Initializing {self.__class__.__name__}", project=self.project)
|
|
237
236
|
|
|
238
|
-
self.db = mlrun.model_monitoring.get_store_object(project=self.project)
|
|
239
|
-
|
|
240
237
|
self._window_length = _get_window_length()
|
|
241
238
|
|
|
242
239
|
self.model_monitoring_access_key = self._get_model_monitoring_access_key()
|
|
@@ -253,19 +250,16 @@ class MonitoringApplicationController:
|
|
|
253
250
|
return access_key
|
|
254
251
|
|
|
255
252
|
@staticmethod
|
|
256
|
-
def _should_monitor_endpoint(endpoint:
|
|
253
|
+
def _should_monitor_endpoint(endpoint: mlrun.common.schemas.ModelEndpoint) -> bool:
|
|
257
254
|
return (
|
|
258
|
-
# Is the model endpoint active?
|
|
259
|
-
endpoint[mm_constants.EventFieldType.ACTIVE]
|
|
260
255
|
# Is the model endpoint monitored?
|
|
261
|
-
|
|
262
|
-
== mm_constants.ModelMonitoringMode.enabled
|
|
256
|
+
endpoint.status.monitoring_mode == mm_constants.ModelMonitoringMode.enabled
|
|
263
257
|
# Was the model endpoint called? I.e., are the first and last requests nonempty?
|
|
264
|
-
and endpoint
|
|
265
|
-
and endpoint
|
|
258
|
+
and endpoint.status.first_request
|
|
259
|
+
and endpoint.status.last_request
|
|
266
260
|
# Is the model endpoint not a router endpoint? Router endpoint has no feature stats
|
|
267
|
-
and
|
|
268
|
-
!= mm_constants.EndpointType.ROUTER
|
|
261
|
+
and endpoint.metadata.endpoint_type.value
|
|
262
|
+
!= mm_constants.EndpointType.ROUTER.value
|
|
269
263
|
)
|
|
270
264
|
|
|
271
265
|
def run(self) -> None:
|
|
@@ -281,7 +275,10 @@ class MonitoringApplicationController:
|
|
|
281
275
|
logger.info("Start running monitoring controller")
|
|
282
276
|
try:
|
|
283
277
|
applications_names = []
|
|
284
|
-
|
|
278
|
+
endpoints_list = mlrun.db.get_run_db().list_model_endpoints(
|
|
279
|
+
project=self.project, tsdb_metrics=True
|
|
280
|
+
)
|
|
281
|
+
endpoints = endpoints_list.endpoints
|
|
285
282
|
if not endpoints:
|
|
286
283
|
logger.info("No model endpoints found", project=self.project)
|
|
287
284
|
return
|
|
@@ -333,12 +330,19 @@ class MonitoringApplicationController:
|
|
|
333
330
|
model_monitoring_access_key=self.model_monitoring_access_key,
|
|
334
331
|
storage_options=self.storage_options,
|
|
335
332
|
)
|
|
333
|
+
else:
|
|
334
|
+
logger.debug(
|
|
335
|
+
"Skipping endpoint, not ready or not suitable for monitoring",
|
|
336
|
+
endpoint_id=endpoint.metadata.uid,
|
|
337
|
+
endpoint_name=endpoint.metadata.name,
|
|
338
|
+
)
|
|
339
|
+
logger.info("Finished running monitoring controller")
|
|
336
340
|
|
|
337
341
|
@classmethod
|
|
338
342
|
def model_endpoint_process(
|
|
339
343
|
cls,
|
|
340
344
|
project: str,
|
|
341
|
-
endpoint:
|
|
345
|
+
endpoint: mlrun.common.schemas.ModelEndpoint,
|
|
342
346
|
applications_names: list[str],
|
|
343
347
|
window_length: int,
|
|
344
348
|
model_monitoring_access_key: str,
|
|
@@ -356,11 +360,11 @@ class MonitoringApplicationController:
|
|
|
356
360
|
:param model_monitoring_access_key: (str) Access key to apply the model monitoring process.
|
|
357
361
|
:param storage_options: (dict) Storage options for reading the infer parquet files.
|
|
358
362
|
"""
|
|
359
|
-
endpoint_id = endpoint
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
endpoint[mm_constants.EventFieldType.FEATURE_SET_URI]
|
|
363
|
+
endpoint_id = endpoint.metadata.uid
|
|
364
|
+
not_batch_endpoint = not (
|
|
365
|
+
endpoint.metadata.endpoint_type == EndpointType.BATCH_EP
|
|
363
366
|
)
|
|
367
|
+
m_fs = fstore.get_feature_set(endpoint.spec.monitoring_feature_set_uri)
|
|
364
368
|
try:
|
|
365
369
|
with _BatchWindowGenerator(
|
|
366
370
|
project=project, endpoint_id=endpoint_id, window_length=window_length
|
|
@@ -371,11 +375,9 @@ class MonitoringApplicationController:
|
|
|
371
375
|
end_infer_time,
|
|
372
376
|
) in batch_window_generator.get_intervals(
|
|
373
377
|
application=application,
|
|
374
|
-
first_request=endpoint
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
last_request=endpoint[mm_constants.EventFieldType.LAST_REQUEST],
|
|
378
|
-
has_stream=has_stream,
|
|
378
|
+
first_request=endpoint.status.first_request,
|
|
379
|
+
last_request=endpoint.status.last_request,
|
|
380
|
+
not_batch_endpoint=not_batch_endpoint,
|
|
379
381
|
):
|
|
380
382
|
df = m_fs.to_dataframe(
|
|
381
383
|
start_time=start_infer_time,
|
|
@@ -401,15 +403,17 @@ class MonitoringApplicationController:
|
|
|
401
403
|
start_infer_time=start_infer_time,
|
|
402
404
|
end_infer_time=end_infer_time,
|
|
403
405
|
endpoint_id=endpoint_id,
|
|
406
|
+
endpoint_name=endpoint.metadata.name,
|
|
404
407
|
project=project,
|
|
405
408
|
applications_names=[application],
|
|
406
409
|
model_monitoring_access_key=model_monitoring_access_key,
|
|
407
410
|
)
|
|
411
|
+
logger.info("Finished processing endpoint", endpoint_id=endpoint_id)
|
|
408
412
|
|
|
409
413
|
except Exception:
|
|
410
414
|
logger.exception(
|
|
411
415
|
"Encountered an exception",
|
|
412
|
-
endpoint_id=endpoint
|
|
416
|
+
endpoint_id=endpoint.metadata.uid,
|
|
413
417
|
)
|
|
414
418
|
|
|
415
419
|
@staticmethod
|
|
@@ -417,6 +421,7 @@ class MonitoringApplicationController:
|
|
|
417
421
|
start_infer_time: datetime.datetime,
|
|
418
422
|
end_infer_time: datetime.datetime,
|
|
419
423
|
endpoint_id: str,
|
|
424
|
+
endpoint_name: str,
|
|
420
425
|
project: str,
|
|
421
426
|
applications_names: list[str],
|
|
422
427
|
model_monitoring_access_key: str,
|
|
@@ -440,6 +445,7 @@ class MonitoringApplicationController:
|
|
|
440
445
|
sep=" ", timespec="microseconds"
|
|
441
446
|
),
|
|
442
447
|
mm_constants.ApplicationEvent.ENDPOINT_ID: endpoint_id,
|
|
448
|
+
mm_constants.ApplicationEvent.ENDPOINT_NAME: endpoint_name,
|
|
443
449
|
mm_constants.ApplicationEvent.OUTPUT_STREAM_URI: get_stream_path(
|
|
444
450
|
project=project,
|
|
445
451
|
function_name=mm_constants.MonitoringFunctionNames.WRITER,
|
|
@@ -12,7 +12,5 @@
|
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
|
|
15
|
-
from .stores import ObjectStoreFactory, get_store_object
|
|
16
|
-
from .stores.base import StoreBase
|
|
17
15
|
from .tsdb import get_tsdb_connector
|
|
18
16
|
from .tsdb.base import TSDBConnector
|
|
@@ -47,7 +47,7 @@ class TSDBConnector(ABC):
|
|
|
47
47
|
self.project = project
|
|
48
48
|
|
|
49
49
|
@abstractmethod
|
|
50
|
-
def apply_monitoring_stream_steps(self, graph) -> None:
|
|
50
|
+
def apply_monitoring_stream_steps(self, graph, **kwargs) -> None:
|
|
51
51
|
"""
|
|
52
52
|
Apply TSDB steps on the provided monitoring graph. Throughout these steps, the graph stores live data of
|
|
53
53
|
different key metric dictionaries. This data is being used by the monitoring dashboards in
|
|
@@ -294,6 +294,7 @@ class TSDBConnector(ABC):
|
|
|
294
294
|
) -> pd.DataFrame:
|
|
295
295
|
"""
|
|
296
296
|
Fetches data from the predictions TSDB table and returns the average latency for each specified endpoint
|
|
297
|
+
in the provided time range, which by default is the last 24 hours.
|
|
297
298
|
|
|
298
299
|
:param endpoint_ids: A list of model endpoint identifiers.
|
|
299
300
|
:param start: The start time for the query.
|
|
@@ -164,7 +164,7 @@ class TDEngineConnector(TSDBConnector):
|
|
|
164
164
|
def _convert_to_datetime(val: typing.Union[str, datetime]) -> datetime:
|
|
165
165
|
return datetime.fromisoformat(val) if isinstance(val, str) else val
|
|
166
166
|
|
|
167
|
-
def apply_monitoring_stream_steps(self, graph):
|
|
167
|
+
def apply_monitoring_stream_steps(self, graph, **kwarg):
|
|
168
168
|
"""
|
|
169
169
|
Apply TSDB steps on the provided monitoring graph. Throughout these steps, the graph stores live data of
|
|
170
170
|
different key metric dictionaries. This data is being used by the monitoring dashboards in
|
|
@@ -701,6 +701,7 @@ class TDEngineConnector(TSDBConnector):
|
|
|
701
701
|
endpoint_ids = (
|
|
702
702
|
endpoint_ids if isinstance(endpoint_ids, list) else [endpoint_ids]
|
|
703
703
|
)
|
|
704
|
+
start = start or (mlrun.utils.datetime_now() - timedelta(hours=24))
|
|
704
705
|
start, end = self._get_start_end(start, end)
|
|
705
706
|
df = self._get_records(
|
|
706
707
|
table=self.tables[mm_schemas.TDEngineSuperTables.PREDICTIONS].super_table,
|
|
@@ -168,6 +168,9 @@ class V3IOTSDBConnector(TSDBConnector):
|
|
|
168
168
|
tsdb_batching_max_events: int = 1000,
|
|
169
169
|
tsdb_batching_timeout_secs: int = 30,
|
|
170
170
|
sample_window: int = 10,
|
|
171
|
+
aggregate_windows: Optional[list[str]] = None,
|
|
172
|
+
aggregate_period: str = "1m",
|
|
173
|
+
**kwarg,
|
|
171
174
|
):
|
|
172
175
|
"""
|
|
173
176
|
Apply TSDB steps on the provided monitoring graph. Throughout these steps, the graph stores live data of
|
|
@@ -178,7 +181,40 @@ class V3IOTSDBConnector(TSDBConnector):
|
|
|
178
181
|
- endpoint_features (Prediction and feature names and values)
|
|
179
182
|
- custom_metrics (user-defined metrics)
|
|
180
183
|
"""
|
|
184
|
+
aggregate_windows = aggregate_windows or ["5m", "1h"]
|
|
181
185
|
|
|
186
|
+
# Calculate number of predictions and average latency
|
|
187
|
+
def apply_storey_aggregations():
|
|
188
|
+
# Calculate the number of predictions and the average latency for each window (5 min and 1 hour by default)
|
|
189
|
+
graph.add_step(
|
|
190
|
+
class_name="storey.AggregateByKey",
|
|
191
|
+
aggregates=[
|
|
192
|
+
{
|
|
193
|
+
"name": EventFieldType.LATENCY,
|
|
194
|
+
"column": EventFieldType.LATENCY,
|
|
195
|
+
"operations": ["count", "avg"],
|
|
196
|
+
"windows": aggregate_windows,
|
|
197
|
+
"period": aggregate_period,
|
|
198
|
+
}
|
|
199
|
+
],
|
|
200
|
+
name=EventFieldType.LATENCY,
|
|
201
|
+
after="MapFeatureNames",
|
|
202
|
+
step_name="Aggregates",
|
|
203
|
+
table=".",
|
|
204
|
+
key_field=EventFieldType.ENDPOINT_ID,
|
|
205
|
+
)
|
|
206
|
+
# Rename the latency count aggregations (5 min and 1 hour windows) to the predictions-count field names
|
|
207
|
+
graph.add_step(
|
|
208
|
+
class_name="storey.Rename",
|
|
209
|
+
mapping={
|
|
210
|
+
"latency_count_5m": mm_schemas.EventLiveStats.PREDICTIONS_COUNT_5M,
|
|
211
|
+
"latency_count_1h": mm_schemas.EventLiveStats.PREDICTIONS_COUNT_1H,
|
|
212
|
+
},
|
|
213
|
+
name="Rename",
|
|
214
|
+
after=EventFieldType.LATENCY,
|
|
215
|
+
)
|
|
216
|
+
|
|
217
|
+
apply_storey_aggregations()
|
|
182
218
|
# Write latency per prediction, labeled by endpoint ID only
|
|
183
219
|
graph.add_step(
|
|
184
220
|
"storey.TSDBTarget",
|
|
@@ -853,6 +889,7 @@ class V3IOTSDBConnector(TSDBConnector):
|
|
|
853
889
|
endpoint_ids = (
|
|
854
890
|
endpoint_ids if isinstance(endpoint_ids, list) else [endpoint_ids]
|
|
855
891
|
)
|
|
892
|
+
start = start or (mlrun.utils.datetime_now() - timedelta(hours=24))
|
|
856
893
|
start, end = self._get_start_end(start, end)
|
|
857
894
|
df = self._get_records(
|
|
858
895
|
table=mm_schemas.FileTargetKind.PREDICTIONS,
|
|
@@ -864,4 +901,10 @@ class V3IOTSDBConnector(TSDBConnector):
|
|
|
864
901
|
)
|
|
865
902
|
if not df.empty:
|
|
866
903
|
df.dropna(inplace=True)
|
|
904
|
+
df.rename(
|
|
905
|
+
columns={
|
|
906
|
+
f"avg({mm_schemas.EventFieldType.LATENCY})": f"avg_{mm_schemas.EventFieldType.LATENCY}"
|
|
907
|
+
},
|
|
908
|
+
inplace=True,
|
|
909
|
+
)
|
|
867
910
|
return df.reset_index(drop=True)
|