mlrun 1.10.0rc16__py3-none-any.whl → 1.10.0rc42__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mlrun might be problematic.
- mlrun/__init__.py +22 -2
- mlrun/artifacts/document.py +6 -1
- mlrun/artifacts/llm_prompt.py +21 -15
- mlrun/artifacts/model.py +3 -3
- mlrun/common/constants.py +9 -0
- mlrun/common/formatters/artifact.py +1 -0
- mlrun/common/model_monitoring/helpers.py +86 -0
- mlrun/common/schemas/__init__.py +2 -0
- mlrun/common/schemas/auth.py +2 -0
- mlrun/common/schemas/function.py +10 -0
- mlrun/common/schemas/hub.py +30 -18
- mlrun/common/schemas/model_monitoring/__init__.py +2 -0
- mlrun/common/schemas/model_monitoring/constants.py +30 -6
- mlrun/common/schemas/model_monitoring/functions.py +13 -4
- mlrun/common/schemas/model_monitoring/model_endpoints.py +11 -0
- mlrun/common/schemas/pipeline.py +1 -1
- mlrun/common/schemas/serving.py +3 -0
- mlrun/common/schemas/workflow.py +1 -0
- mlrun/common/secrets.py +22 -1
- mlrun/config.py +32 -10
- mlrun/datastore/__init__.py +11 -3
- mlrun/datastore/azure_blob.py +162 -47
- mlrun/datastore/datastore.py +9 -4
- mlrun/datastore/datastore_profile.py +61 -5
- mlrun/datastore/model_provider/huggingface_provider.py +363 -0
- mlrun/datastore/model_provider/mock_model_provider.py +87 -0
- mlrun/datastore/model_provider/model_provider.py +211 -74
- mlrun/datastore/model_provider/openai_provider.py +243 -71
- mlrun/datastore/s3.py +24 -2
- mlrun/datastore/storeytargets.py +2 -3
- mlrun/datastore/utils.py +15 -3
- mlrun/db/base.py +27 -19
- mlrun/db/httpdb.py +57 -48
- mlrun/db/nopdb.py +25 -10
- mlrun/execution.py +55 -13
- mlrun/hub/__init__.py +15 -0
- mlrun/hub/module.py +181 -0
- mlrun/k8s_utils.py +105 -16
- mlrun/launcher/base.py +13 -6
- mlrun/launcher/local.py +2 -0
- mlrun/model.py +9 -3
- mlrun/model_monitoring/api.py +66 -27
- mlrun/model_monitoring/applications/__init__.py +1 -1
- mlrun/model_monitoring/applications/base.py +372 -136
- mlrun/model_monitoring/applications/context.py +2 -4
- mlrun/model_monitoring/applications/results.py +4 -7
- mlrun/model_monitoring/controller.py +239 -101
- mlrun/model_monitoring/db/_schedules.py +36 -13
- mlrun/model_monitoring/db/_stats.py +4 -3
- mlrun/model_monitoring/db/tsdb/base.py +29 -9
- mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +4 -5
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +154 -50
- mlrun/model_monitoring/db/tsdb/tdengine/writer_graph_steps.py +51 -0
- mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +17 -4
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +245 -51
- mlrun/model_monitoring/helpers.py +28 -5
- mlrun/model_monitoring/stream_processing.py +45 -14
- mlrun/model_monitoring/writer.py +220 -1
- mlrun/platforms/__init__.py +3 -2
- mlrun/platforms/iguazio.py +7 -3
- mlrun/projects/operations.py +6 -1
- mlrun/projects/pipelines.py +2 -2
- mlrun/projects/project.py +128 -45
- mlrun/run.py +94 -17
- mlrun/runtimes/__init__.py +18 -0
- mlrun/runtimes/base.py +14 -6
- mlrun/runtimes/daskjob.py +1 -0
- mlrun/runtimes/local.py +5 -2
- mlrun/runtimes/mounts.py +20 -2
- mlrun/runtimes/nuclio/__init__.py +1 -0
- mlrun/runtimes/nuclio/application/application.py +147 -17
- mlrun/runtimes/nuclio/function.py +70 -27
- mlrun/runtimes/nuclio/serving.py +85 -4
- mlrun/runtimes/pod.py +213 -21
- mlrun/runtimes/utils.py +49 -9
- mlrun/secrets.py +54 -13
- mlrun/serving/remote.py +79 -6
- mlrun/serving/routers.py +23 -41
- mlrun/serving/server.py +211 -40
- mlrun/serving/states.py +536 -156
- mlrun/serving/steps.py +62 -0
- mlrun/serving/system_steps.py +136 -81
- mlrun/serving/v2_serving.py +9 -10
- mlrun/utils/helpers.py +212 -82
- mlrun/utils/logger.py +3 -1
- mlrun/utils/notifications/notification/base.py +18 -0
- mlrun/utils/notifications/notification/git.py +2 -4
- mlrun/utils/notifications/notification/slack.py +2 -4
- mlrun/utils/notifications/notification/webhook.py +2 -5
- mlrun/utils/notifications/notification_pusher.py +1 -1
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.10.0rc16.dist-info → mlrun-1.10.0rc42.dist-info}/METADATA +44 -45
- {mlrun-1.10.0rc16.dist-info → mlrun-1.10.0rc42.dist-info}/RECORD +97 -92
- mlrun/api/schemas/__init__.py +0 -259
- {mlrun-1.10.0rc16.dist-info → mlrun-1.10.0rc42.dist-info}/WHEEL +0 -0
- {mlrun-1.10.0rc16.dist-info → mlrun-1.10.0rc42.dist-info}/entry_points.txt +0 -0
- {mlrun-1.10.0rc16.dist-info → mlrun-1.10.0rc42.dist-info}/licenses/LICENSE +0 -0
- {mlrun-1.10.0rc16.dist-info → mlrun-1.10.0rc42.dist-info}/top_level.txt +0 -0
mlrun/model_monitoring/applications/context.py

@@ -24,15 +24,12 @@ import mlrun.common.schemas.model_monitoring.constants as mm_constants
 import mlrun.errors
 import mlrun.feature_store as fstore
 import mlrun.feature_store.feature_set as fs
-import mlrun.features
 import mlrun.serving
 import mlrun.utils
 from mlrun.artifacts import Artifact, DatasetArtifact, ModelArtifact, get_model
 from mlrun.common.model_monitoring.helpers import FeatureStats
 from mlrun.common.schemas import ModelEndpoint
-from mlrun.model_monitoring.helpers import (
-    calculate_inputs_statistics,
-)
+from mlrun.model_monitoring.helpers import calculate_inputs_statistics


 class _ArtifactsLogger(Protocol):

@@ -252,6 +249,7 @@ class MonitoringApplicationContext:
                 project=self.project_name,
                 endpoint_id=self.endpoint_id,
                 feature_analysis=True,
+                tsdb_metrics=False,
             )
         return self._model_endpoint

mlrun/model_monitoring/applications/results.py

@@ -14,16 +14,13 @@

 import dataclasses
 import json
-import re
 from abc import ABC, abstractmethod

 from pydantic.v1 import validator
 from pydantic.v1.dataclasses import dataclass

-import mlrun.common.helpers
-import mlrun.common.model_monitoring.helpers
 import mlrun.common.schemas.model_monitoring.constants as mm_constants
-import mlrun.
+import mlrun.errors
 from mlrun.utils import logger

 _RESULT_EXTRA_DATA_MAX_SIZE = 998

@@ -33,10 +30,10 @@ class _ModelMonitoringApplicationDataRes(ABC):
     name: str

     def __post_init__(self):
-
-        if not re.fullmatch(pat, self.name):
+        if not mm_constants.RESULT_NAME_REGEX.fullmatch(self.name):
             raise mlrun.errors.MLRunValueError(
-
+                "The application result or metric name must comply with the regex "
+                f"`{mm_constants.RESULT_NAME_REGEX.pattern}`"
             )

     @abstractmethod
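For reference, a minimal sketch of the new name validation. The pattern below is a hypothetical stand-in; the real `RESULT_NAME_REGEX` is defined in `mlrun/common/schemas/model_monitoring/constants.py` and is not shown in this diff.

```python
import dataclasses
import re

# Hypothetical stand-in for mm_constants.RESULT_NAME_REGEX (not shown in this diff).
RESULT_NAME_REGEX = re.compile(r"[a-zA-Z_][a-zA-Z0-9_]*")


@dataclasses.dataclass
class _Result:
    name: str

    def __post_init__(self) -> None:
        # Mirrors the new __post_init__ check: full-match the name against the shared regex.
        if not RESULT_NAME_REGEX.fullmatch(self.name):
            raise ValueError(
                "The application result or metric name must comply with the regex "
                f"`{RESULT_NAME_REGEX.pattern}`"
            )


_Result(name="drift_score")  # accepted

try:
    _Result(name="drift score!")  # rejected: whitespace and "!" don't match the pattern
except ValueError as err:
    print(err)
```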
mlrun/model_monitoring/controller.py

@@ -11,20 +11,20 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 import collections
 import concurrent.futures
 import datetime
 import json
 import os
 import traceback
-
+import warnings
 from collections.abc import Iterator
 from contextlib import AbstractContextManager
 from types import TracebackType
-from typing import Any, NamedTuple, Optional, Union, cast
+from typing import Any, Final, NamedTuple, Optional, Union, cast

 import nuclio_sdk
+import numpy as np
 import pandas as pd

 import mlrun

@@ -38,7 +38,6 @@ from mlrun.common.schemas import EndpointType
 from mlrun.common.schemas.model_monitoring.constants import (
     ControllerEvent,
     ControllerEventEndpointPolicy,
-    ControllerEventKind,
 )
 from mlrun.errors import err_to_str
 from mlrun.model_monitoring.helpers import batch_dict2timedelta

@@ -54,14 +53,17 @@ class _Interval(NamedTuple):


 class _BatchWindow:
+    TIMESTAMP_RESOLUTION_MICRO: Final = 1e-6  # 0.000001 seconds or 1 microsecond
+
     def __init__(
         self,
         *,
         schedules_file: schedules.ModelMonitoringSchedulesFileEndpoint,
         application: str,
         timedelta_seconds: int,
-        last_updated:
-        first_request:
+        last_updated: float,
+        first_request: float,
+        endpoint_mode: mm_constants.EndpointMode = mm_constants.EndpointMode.REAL_TIME,
     ) -> None:
         """
         Initialize a batch window object that handles the batch interval time range
@@ -74,21 +76,34 @@ class _BatchWindow:
         self._stop = last_updated
         self._step = timedelta_seconds
         self._db = schedules_file
+        self._endpoint_mode = endpoint_mode
         self._start = self._get_last_analyzed()

-    def _get_saved_last_analyzed(
-
+    def _get_saved_last_analyzed(
+        self,
+    ) -> Optional[float]:
+        return self._db.get_application_time(self._application)

-    def _update_last_analyzed(self, last_analyzed:
+    def _update_last_analyzed(self, last_analyzed: float) -> None:
         self._db.update_application_time(
             application=self._application, timestamp=last_analyzed
         )

-    def _get_initial_last_analyzed(self) ->
+    def _get_initial_last_analyzed(self) -> float:
+        if self._endpoint_mode == mm_constants.EndpointMode.BATCH:
+            logger.info(
+                "No last analyzed time was found for this endpoint and application, as this is "
+                "probably the first time this application is running. Initializing last analyzed "
+                "to the start of the batch time",
+                application=self._application,
+                start_batch_time=self._first_request,
+            )
+            return self._first_request
         logger.info(
             "No last analyzed time was found for this endpoint and application, as this is "
             "probably the first time this application is running. Initializing last analyzed "
-            "to the latest between first request
+            "to the latest between first request time or last "
+            "update time minus one day",
             application=self._application,
             first_request=self._first_request,
             last_updated=self._stop,

@@ -100,9 +115,12 @@ class _BatchWindow:
             self._stop - first_period_in_seconds,
         )

-    def _get_last_analyzed(self) ->
+    def _get_last_analyzed(self) -> float:
         saved_last_analyzed = self._get_saved_last_analyzed()
         if saved_last_analyzed is not None:
+            if self._endpoint_mode == mm_constants.EndpointMode.BATCH:
+                # Use the maximum between the saved last analyzed and the start of the batch
+                return max(saved_last_analyzed, self._first_request)
             return saved_last_analyzed
         else:
             last_analyzed = self._get_initial_last_analyzed()
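In short, for `EndpointMode.BATCH` the analysis window never starts before the batch's own first request. A simplified sketch of that clamping, using plain epoch-second floats instead of the schedules file (names are illustrative, not mlrun API):

```python
from typing import Optional


def resolve_window_start(
    saved_last_analyzed: Optional[float],
    first_request: float,
    batch_mode: bool,
    real_time_initial: float,
) -> float:
    """Simplified illustration of the batch-mode clamping shown in the hunk above."""
    if saved_last_analyzed is not None:
        if batch_mode:
            # Never re-analyze data from before the start of this batch.
            return max(saved_last_analyzed, first_request)
        return saved_last_analyzed
    # Nothing saved yet: batch endpoints start at the batch's first request,
    # real-time endpoints fall back to whatever initial value the caller computed.
    return first_request if batch_mode else real_time_initial


print(resolve_window_start(1_700_000_000.0, 1_700_003_600.0, batch_mode=True,
                           real_time_initial=1_700_000_000.0))  # -> 1700003600.0
```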
@@ -113,16 +131,20 @@ class _BatchWindow:
     def get_intervals(self) -> Iterator[_Interval]:
         """Generate the batch interval time ranges."""
         entered = False
+        last_analyzed = None
         # Iterate timestamp from start until timestamp <= stop - step
         # so that the last interval will end at (timestamp + step) <= stop.
         # Add 1 to stop - step to get <= and not <.
-        for timestamp in
+        for timestamp in np.arange(
+            self._start, self._stop - self._step + 1, self._step
+        ):
             entered = True
             start_time = datetime.datetime.fromtimestamp(
                 timestamp, tz=datetime.timezone.utc
             )
             end_time = datetime.datetime.fromtimestamp(
-                timestamp + self._step,
+                timestamp - self.TIMESTAMP_RESOLUTION_MICRO + self._step,
+                tz=datetime.timezone.utc,
             )
             yield _Interval(start_time, end_time)

@@ -134,6 +156,40 @@ class _BatchWindow:
                 last_analyzed=last_analyzed,
             )

+        if self._endpoint_mode == mm_constants.EndpointMode.BATCH:
+            # If the endpoint is a batch endpoint, we need to update the last analyzed time
+            # to the end of the batch time.
+            if last_analyzed:
+                if last_analyzed - self.TIMESTAMP_RESOLUTION_MICRO < self._stop:
+                    # If the last analyzed time is earlier than the stop time,
+                    # yield the final partial interval from last_analyzed to stop
+                    yield _Interval(
+                        datetime.datetime.fromtimestamp(
+                            last_analyzed, tz=datetime.timezone.utc
+                        ),
+                        datetime.datetime.fromtimestamp(
+                            self._stop, tz=datetime.timezone.utc
+                        ),
+                    )
+            else:
+                # The time span between the start and end of the batch is shorter than the step,
+                # so we need to yield a partial interval covering that range.
+                yield _Interval(
+                    datetime.datetime.fromtimestamp(
+                        self._start, tz=datetime.timezone.utc
+                    ),
+                    datetime.datetime.fromtimestamp(
+                        self._stop, tz=datetime.timezone.utc
+                    ),
+                )
+
+            self._update_last_analyzed(last_analyzed=self._stop)
+            logger.debug(
+                "Updated the last analyzed time for this endpoint and application to the end of the batch time",
+                application=self._application,
+                last_analyzed=self._stop,
+            )
+
         if not entered:
             logger.debug(
                 "All the data is set, but no complete intervals were found. "
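The loop now steps through the window with `np.arange` and pulls each interval's end back by one microsecond so adjacent windows do not overlap. A standalone sketch with epoch-second inputs (names simplified, not the mlrun API):

```python
import datetime

import numpy as np

MICRO = 1e-6  # same value as TIMESTAMP_RESOLUTION_MICRO


def intervals(start: float, stop: float, step: float):
    """Yield (start, end) datetime pairs covering [start, stop] in steps of `step` seconds."""
    for ts in np.arange(start, stop - step + 1, step):
        yield (
            datetime.datetime.fromtimestamp(ts, tz=datetime.timezone.utc),
            # End one microsecond before the next window starts.
            datetime.datetime.fromtimestamp(ts + step - MICRO, tz=datetime.timezone.utc),
        )


for window_start, window_end in intervals(1_700_000_000.0, 1_700_003_600.0, 600.0):
    print(window_start.isoformat(), "->", window_end.isoformat())
```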
@@ -178,33 +234,37 @@ class _BatchWindowGenerator(AbstractContextManager):
     def get_application_list(self) -> set[str]:
         return self._schedules_file.get_application_list()

-    def get_min_last_analyzed(self) -> Optional[
+    def get_min_last_analyzed(self) -> Optional[float]:
         return self._schedules_file.get_min_timestamp()

     @classmethod
     def _get_last_updated_time(
-        cls,
-
+        cls,
+        last_request: datetime.datetime,
+        endpoint_mode: mm_constants.EndpointMode,
+        not_old_batch_endpoint: bool,
+    ) -> float:
         """
         Get the last updated time of a model endpoint.
         """
-
-
-            - cast(
+
+        if endpoint_mode == mm_constants.EndpointMode.REAL_TIME:
+            last_updated = last_request.timestamp() - cast(
                 float,
                 mlrun.mlconf.model_endpoint_monitoring.parquet_batching_timeout_secs,
             )
-
-
-
-
-
-
-
-
-
-
-
+            if not not_old_batch_endpoint:
+                # If the endpoint does not have a stream, `last_updated` should be
+                # the minimum between the current time and the last updated time.
+                # This compensates for the bumping mechanism - see
+                # `update_model_endpoint_last_request`.
+                last_updated = min(datetime_now().timestamp(), last_updated)
+                logger.debug(
+                    "The endpoint does not have a stream", last_updated=last_updated
+                )
+
+            return last_updated
+        return last_request.timestamp()

     def get_intervals(
         self,

@@ -212,19 +272,24 @@ class _BatchWindowGenerator(AbstractContextManager):
         application: str,
         first_request: datetime.datetime,
         last_request: datetime.datetime,
-
+        endpoint_mode: mm_constants.EndpointMode,
+        not_old_batch_endpoint: bool,
     ) -> Iterator[_Interval]:
         """
         Get the batch window for a specific endpoint and application.
         `first_request` and `last_request` are the timestamps of the first request and last
         request to the endpoint, respectively. They are guaranteed to be nonempty at this point.
         """
+
         self.batch_window = _BatchWindow(
             schedules_file=self._schedules_file,
             application=application,
             timedelta_seconds=self._timedelta,
-            last_updated=self._get_last_updated_time(
-
+            last_updated=self._get_last_updated_time(
+                last_request, endpoint_mode, not_old_batch_endpoint
+            ),
+            first_request=first_request.timestamp(),
+            endpoint_mode=endpoint_mode,
         )
         yield from self.batch_window.get_intervals()

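The reworked `_get_last_updated_time` subtracts the parquet batching timeout for real-time endpoints, additionally clips stream-less (old batch) endpoints to the current time, and for batch mode simply uses the last request. A hedged sketch with the timeout passed in as a plain argument rather than read from `mlrun.mlconf`:

```python
import datetime


def last_updated_time(
    last_request: datetime.datetime,
    real_time: bool,
    not_old_batch_endpoint: bool,
    parquet_batching_timeout_secs: float = 3600.0,  # illustrative default
) -> float:
    if real_time:
        last_updated = last_request.timestamp() - parquet_batching_timeout_secs
        if not not_old_batch_endpoint:
            # Stream-less (old batch) endpoints: never report a point in the future,
            # compensating for the last-request bumping mechanism.
            now = datetime.datetime.now(tz=datetime.timezone.utc).timestamp()
            last_updated = min(now, last_updated)
        return last_updated
    # Batch mode: analyze right up to the last request.
    return last_request.timestamp()


print(last_updated_time(
    datetime.datetime(2025, 1, 1, tzinfo=datetime.timezone.utc),
    real_time=True,
    not_old_batch_endpoint=True,
))
```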
@@ -282,9 +347,9 @@ class MonitoringApplicationController:
                 mlrun.platforms.iguazio.KafkaOutputStream,
             ],
         ] = {}
-        self.feature_sets: OrderedDict[
-
-        )
+        self.feature_sets: collections.OrderedDict[
+            str, mlrun.feature_store.FeatureSet
+        ] = collections.OrderedDict()
         self.tsdb_connector = mlrun.model_monitoring.get_tsdb_connector(
             project=self.project
         )

@@ -394,15 +459,14 @@ class MonitoringApplicationController:
                 base_period_minutes, current_min_last_analyzed, current_time
             )
            and (
-
-                != last_timestamp_sent
+                endpoint.status.last_request.timestamp() != last_timestamp_sent
                 or current_min_last_analyzed != last_analyzed_sent
            )
        ):
            # Write to schedule chief file the last_request, min_last_analyzed we pushed event to stream
            schedules_file.update_endpoint_timestamps(
                endpoint_uid=endpoint.metadata.uid,
-                last_request=
+                last_request=endpoint.status.last_request.timestamp(),
                last_analyzed=current_min_last_analyzed,
            )
            return True

@@ -428,7 +492,7 @@ class MonitoringApplicationController:
     @staticmethod
     def _should_send_nop_event(
         base_period_minutes: int,
-        min_last_analyzed:
+        min_last_analyzed: float,
         current_time: datetime.datetime,
     ):
         if min_last_analyzed:
@@ -477,24 +541,71 @@ class MonitoringApplicationController:
         try:
             project_name = event[ControllerEvent.PROJECT]
             endpoint_id = event[ControllerEvent.ENDPOINT_ID]
-
-
-
-
+            not_old_batch_endpoint = True
+            if (
+                event[ControllerEvent.KIND]
+                == mm_constants.ControllerEventKind.BATCH_COMPLETE
+            ):
+                monitoring_functions = (
+                    self.project_obj.list_model_monitoring_functions()
+                )
+                if monitoring_functions:
+                    applications_names = list(
+                        {app.metadata.name for app in monitoring_functions}
+                    )
+                    last_stream_timestamp = datetime.datetime.fromisoformat(
+                        event[ControllerEvent.LAST_TIMESTAMP]
+                    )
+                    first_request = datetime.datetime.fromisoformat(
+                        event[ControllerEvent.FIRST_TIMESTAMP]
+                    )
+                    endpoint_mode = mm_constants.EndpointMode.BATCH
+                    model_endpoint = self.project_obj.list_model_endpoints(
+                        uids=[endpoint_id],
+                        latest_only=True,
+                    ).endpoints

-
-
-
+                    if not model_endpoint:
+                        logger.error(
+                            "Batch model endpoint not found",
+                            endpoint_id=endpoint_id,
+                            project=project_name,
+                        )
+                        return
+
+                    endpoint_name = model_endpoint[0].metadata.name
+                    endpoint_updated = model_endpoint[0].metadata.updated.isoformat()
+
+                else:
+                    logger.info("No monitoring functions found", project=self.project)
+                    return
+
+            else:
+                endpoint_name = event[ControllerEvent.ENDPOINT_NAME]
+                applications_names = event[ControllerEvent.ENDPOINT_POLICY][
+                    ControllerEventEndpointPolicy.MONITORING_APPLICATIONS
+                ]
+                last_stream_timestamp = datetime.datetime.fromisoformat(
+                    event[ControllerEvent.TIMESTAMP]
+                )
+                first_request = datetime.datetime.fromisoformat(
+                    event[ControllerEvent.FIRST_REQUEST]
+                )
+
+                endpoint_updated = event[ControllerEvent.ENDPOINT_POLICY][
+                    ControllerEventEndpointPolicy.ENDPOINT_UPDATED
+                ]
+
+                endpoint_mode = mm_constants.EndpointMode.REAL_TIME
+
+                not_old_batch_endpoint = (
+                    event[ControllerEvent.ENDPOINT_TYPE] != EndpointType.BATCH_EP
+                )

             logger.info(
-                "Starting
-            )
-            last_stream_timestamp = datetime.datetime.fromisoformat(
-                event[ControllerEvent.TIMESTAMP]
-            )
-            first_request = datetime.datetime.fromisoformat(
-                event[ControllerEvent.FIRST_REQUEST]
+                "Starting to analyze", timestamp=last_stream_timestamp.isoformat()
             )
+
             with _BatchWindowGenerator(
                 project=project_name,
                 endpoint_id=endpoint_id,
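The new branching boils down to: a `BATCH_COMPLETE` event carries its own time range and triggers an endpoint lookup, while a regular event takes its window bounds and policy from the event itself. A simplified dispatch sketch (the dictionary keys are illustrative stand-ins for the `ControllerEvent` constants, not mlrun API):

```python
import datetime


def resolve_analysis_window(event: dict) -> tuple[datetime.datetime, datetime.datetime, str]:
    """Return (first_request, last_stream_timestamp, mode) for an incoming controller event."""
    if event["kind"] == "batch-complete":
        # Offline batch run: the event itself says which range the batch covered.
        first = datetime.datetime.fromisoformat(event["first_timestamp"])
        last = datetime.datetime.fromisoformat(event["last_timestamp"])
        return first, last, "batch"
    # Real-time endpoint: the window runs from the first request to the event timestamp.
    first = datetime.datetime.fromisoformat(event["first_request"])
    last = datetime.datetime.fromisoformat(event["timestamp"])
    return first, last, "real-time"


print(resolve_analysis_window({
    "kind": "batch-complete",
    "first_timestamp": "2025-01-01T00:00:00+00:00",
    "last_timestamp": "2025-01-01T01:00:00+00:00",
}))
```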
@@ -506,12 +617,13 @@ class MonitoringApplicationController:
                     end_infer_time,
                 ) in batch_window_generator.get_intervals(
                     application=application,
-                    not_batch_endpoint=not_batch_endpoint,
                     first_request=first_request,
                     last_request=last_stream_timestamp,
+                    endpoint_mode=endpoint_mode,
+                    not_old_batch_endpoint=not_old_batch_endpoint,
                 ):
                     data_in_window = False
-                    if
+                    if not_old_batch_endpoint:
                         # Serving endpoint - get the relevant window data from the TSDB
                         prediction_metric = self.tsdb_connector.read_predictions(
                             start=start_infer_time,

@@ -521,6 +633,16 @@ class MonitoringApplicationController:
                         if prediction_metric.data:
                             data_in_window = True
                     else:
+                        # Old batch endpoint - get the relevant window data from the parquet target
+                        warnings.warn(
+                            "Analyzing batch model endpoints with real time processing events is "
+                            "deprecated in 1.10.0 and will be removed in 1.12.0. "
+                            "Instead, use job-based serving to invoke and analyze offline batch model"
+                            "endpoints.",
+                            # TODO: Remove this in 1.12.0
+                            FutureWarning,
+                        )
+
                         if endpoint_id not in self.feature_sets:
                             self.feature_sets[endpoint_id] = fstore.get_feature_set(
                                 event[ControllerEvent.FEATURE_SET_URI]

@@ -533,7 +655,6 @@ class MonitoringApplicationController:
                             self.feature_sets.popitem(last=True)
                         m_fs = self.feature_sets.get(endpoint_id)

-                        # Batch endpoint - get the relevant window data from the parquet target
                         df = m_fs.to_dataframe(
                             start_time=start_infer_time,
                             end_time=end_infer_time,

@@ -542,6 +663,7 @@ class MonitoringApplicationController:
                         )
                         if len(df) > 0:
                             data_in_window = True
+
                     if not data_in_window:
                         logger.info(
                             "No data found for the given interval",
@@ -557,56 +679,60 @@ class MonitoringApplicationController:
                             endpoint_id=endpoint_id,
                         )
                         self._push_to_applications(
-                            start_infer_time=start_infer_time
+                            start_infer_time=start_infer_time
+                            - datetime.timedelta(
+                                batch_window_generator.batch_window.TIMESTAMP_RESOLUTION_MICRO
+                            ),  # We subtract a microsecond to ensure that the apps will retrieve start time data.
                             end_infer_time=end_infer_time,
                             endpoint_id=endpoint_id,
                             endpoint_name=endpoint_name,
                             project=project_name,
                             applications_names=[application],
                             model_monitoring_access_key=self.model_monitoring_access_key,
-                            endpoint_updated=
-                                ControllerEventEndpointPolicy.ENDPOINT_UPDATED
-                            ],
+                            endpoint_updated=endpoint_updated,
                         )
-
-                    ControllerEventEndpointPolicy.BASE_PERIOD
-                ]
-                current_time = mlrun.utils.datetime_now()
+
                 if (
-
+                    event[ControllerEvent.KIND]
+                    == mm_constants.ControllerEventKind.REGULAR_EVENT
+                ):
+                    base_period = event[ControllerEvent.ENDPOINT_POLICY][
+                        ControllerEventEndpointPolicy.BASE_PERIOD
+                    ]
+                    current_time = mlrun.utils.datetime_now()
+                    if self._should_send_nop_event(
                         base_period,
                         batch_window_generator.get_min_last_analyzed(),
                         current_time,
-                )
-
-
-
-
-
-
-
-
-
-
-
-                        ControllerEvent.
-
-
-                        ControllerEvent.
-
-
-                        ControllerEvent.
-
-
-
-
-
-
-
-                        endpoint_id=endpoint_id,
-                    )
+                    ):
+                        event = {
+                            ControllerEvent.KIND: mm_constants.ControllerEventKind.NOP_EVENT,
+                            ControllerEvent.PROJECT: project_name,
+                            ControllerEvent.ENDPOINT_ID: endpoint_id,
+                            ControllerEvent.ENDPOINT_NAME: endpoint_name,
+                            ControllerEvent.TIMESTAMP: current_time.isoformat(
+                                timespec="microseconds"
+                            ),
+                            ControllerEvent.ENDPOINT_POLICY: event[
+                                ControllerEvent.ENDPOINT_POLICY
+                            ],
+                            ControllerEvent.ENDPOINT_TYPE: event[
+                                ControllerEvent.ENDPOINT_TYPE
+                            ],
+                            ControllerEvent.FEATURE_SET_URI: event[
+                                ControllerEvent.FEATURE_SET_URI
+                            ],
+                            ControllerEvent.FIRST_REQUEST: event[
+                                ControllerEvent.FIRST_REQUEST
+                            ],
+                        }
+                        self._push_to_main_stream(
+                            event=event,
+                            endpoint_id=endpoint_id,
+                        )
             logger.info(
-                "Finish analyze for",
+                "Finish analyze for",
+                timestamp=last_stream_timestamp,
             )

         except Exception:
@@ -674,7 +800,18 @@ class MonitoringApplicationController:
         """
         logger.info("Starting monitoring controller chief")
         applications_names = []
-        endpoints = self.project_obj.list_model_endpoints(
+        endpoints = self.project_obj.list_model_endpoints(
+            tsdb_metrics=False,
+            modes=[
+                mm_constants.EndpointMode.REAL_TIME,
+                mm_constants.EndpointMode.BATCH_LEGACY,
+            ],
+        ).endpoints
+
+        if not endpoints:
+            logger.info("No model endpoints found", project=self.project)
+            return
+
         last_request_dict = self.tsdb_connector.get_last_request(
             endpoint_ids=[mep.metadata.uid for mep in endpoints]
         )

@@ -683,9 +820,6 @@ class MonitoringApplicationController:
             mm_constants.EventFieldType.ENDPOINT_ID
         )[mm_constants.ModelEndpointSchema.LAST_REQUEST].to_dict()

-        if not endpoints:
-            logger.info("No model endpoints found", project=self.project)
-            return
         monitoring_functions = self.project_obj.list_model_monitoring_functions()
         if monitoring_functions:
             # if monitoring_functions: - TODO : ML-7700

@@ -731,7 +865,11 @@ class MonitoringApplicationController:
         for endpoint in endpoints:
             last_request = last_request_dict.get(endpoint.metadata.uid, None)
             if isinstance(last_request, float):
-                last_request =
+                last_request = datetime.datetime.fromtimestamp(
+                    last_request, tz=datetime.timezone.utc
+                )
+            elif isinstance(last_request, pd.Timestamp):
+                last_request = last_request.to_pydatetime()
             endpoint.status.last_request = (
                 last_request or endpoint.status.last_request
             )