mlrun 1.8.0rc21__py3-none-any.whl → 1.8.0rc26__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- mlrun/__init__.py +37 -3
- mlrun/alerts/alert.py +1 -0
- mlrun/artifacts/document.py +78 -36
- mlrun/common/formatters/feature_set.py +1 -0
- mlrun/common/schemas/alert.py +3 -0
- mlrun/common/schemas/client_spec.py +0 -1
- mlrun/common/schemas/model_monitoring/constants.py +27 -9
- mlrun/common/schemas/workflow.py +1 -0
- mlrun/config.py +39 -6
- mlrun/datastore/datastore_profile.py +58 -16
- mlrun/datastore/sources.py +7 -1
- mlrun/datastore/vectorstore.py +20 -1
- mlrun/db/base.py +11 -0
- mlrun/db/httpdb.py +21 -9
- mlrun/db/nopdb.py +10 -0
- mlrun/errors.py +4 -0
- mlrun/execution.py +15 -6
- mlrun/launcher/client.py +2 -2
- mlrun/launcher/local.py +5 -1
- mlrun/model_monitoring/applications/_application_steps.py +3 -1
- mlrun/model_monitoring/controller.py +266 -103
- mlrun/model_monitoring/db/tsdb/__init__.py +11 -23
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +5 -2
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +8 -8
- mlrun/model_monitoring/helpers.py +16 -10
- mlrun/model_monitoring/stream_processing.py +85 -35
- mlrun/package/context_handler.py +1 -1
- mlrun/package/packagers_manager.py +4 -18
- mlrun/projects/pipelines.py +2 -2
- mlrun/projects/project.py +123 -38
- mlrun/runtimes/nuclio/serving.py +2 -2
- mlrun/runtimes/sparkjob/spark3job.py +1 -1
- mlrun/secrets.py +1 -1
- mlrun/serving/server.py +11 -3
- mlrun/serving/states.py +65 -8
- mlrun/serving/v2_serving.py +16 -8
- mlrun/utils/helpers.py +81 -21
- mlrun/utils/notifications/notification/base.py +6 -1
- mlrun/utils/notifications/notification/slack.py +5 -1
- mlrun/utils/notifications/notification_pusher.py +13 -4
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.8.0rc21.dist-info → mlrun-1.8.0rc26.dist-info}/METADATA +33 -16
- {mlrun-1.8.0rc21.dist-info → mlrun-1.8.0rc26.dist-info}/RECORD +47 -47
- {mlrun-1.8.0rc21.dist-info → mlrun-1.8.0rc26.dist-info}/WHEEL +1 -1
- {mlrun-1.8.0rc21.dist-info → mlrun-1.8.0rc26.dist-info}/LICENSE +0 -0
- {mlrun-1.8.0rc21.dist-info → mlrun-1.8.0rc26.dist-info}/entry_points.txt +0 -0
- {mlrun-1.8.0rc21.dist-info → mlrun-1.8.0rc26.dist-info}/top_level.txt +0 -0
mlrun/db/httpdb.py
CHANGED

@@ -559,14 +559,6 @@ class HTTPRunDB(RunDBInterface):
             server_cfg.get("external_platform_tracking")
             or config.external_platform_tracking
         )
-        config.model_endpoint_monitoring.tsdb_connection = (
-            server_cfg.get("model_monitoring_tsdb_connection")
-            or config.model_endpoint_monitoring.tsdb_connection
-        )
-        config.model_endpoint_monitoring.stream_connection = (
-            server_cfg.get("stream_connection")
-            or config.model_endpoint_monitoring.stream_connection
-        )
         config.packagers = server_cfg.get("packagers") or config.packagers
         server_data_prefixes = server_cfg.get("feature_store_data_prefixes") or {}
         for prefix in ["default", "nosql", "redisnosql"]:
@@ -771,7 +763,6 @@ class HTTPRunDB(RunDBInterface):
         :returns: :py:class:`~mlrun.common.schemas.BackgroundTask`.
         """
         project = project or config.default_project
-
         response = self.api_call(
             "POST",
             path=f"projects/{project}/runs/{uid}/push-notifications",
@@ -5030,6 +5021,27 @@ class HTTPRunDB(RunDBInterface):
             **kwargs,
         )
 
+    def get_alert_activation(
+        self,
+        project,
+        activation_id,
+    ) -> mlrun.common.schemas.AlertActivation:
+        """
+        Retrieve the alert activation by id
+
+        :param project: Project name for which the summary belongs.
+        :param activation_id: alert activation id.
+        :returns: alert activation object.
+        """
+        project = project or config.default_project
+
+        error = "get alert activation"
+        path = f"projects/{project}/alert-activations/{activation_id}"
+
+        response = self.api_call("GET", path, error)
+
+        return mlrun.common.schemas.AlertActivation(**response.json())
+
     def get_project_summary(
         self, project: Optional[str] = None
     ) -> mlrun.common.schemas.ProjectSummary:
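The new `get_alert_activation` client method maps to `GET projects/{project}/alert-activations/{activation_id}` and returns an `AlertActivation` schema object. A minimal usage sketch, assuming a reachable MLRun API server; the project name and activation id below are placeholders:

import mlrun

db = mlrun.get_run_db()
# fetch a single alert activation by its id
activation = db.get_alert_activation(project="my-project", activation_id="1234")
print(activation)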
mlrun/db/nopdb.py
CHANGED

@@ -84,6 +84,9 @@ class NopDB(RunDBInterface):
     ):
         pass
 
+    def refresh_smtp_configuration(self):
+        pass
+
     def push_pipeline_notifications(
         self,
         pipeline_id,
@@ -945,5 +948,12 @@ class NopDB(RunDBInterface):
     ):
         pass
 
+    def get_alert_activation(
+        self,
+        project,
+        activation_id,
+    ) -> mlrun.common.schemas.AlertActivation:
+        pass
+
     def get_project_summary(self, project: str):
         pass
mlrun/errors.py
CHANGED

@@ -174,6 +174,10 @@ class MLRunInvalidArgumentError(MLRunHTTPStatusError, ValueError):
     error_status_code = HTTPStatus.BAD_REQUEST.value
 
 
+class MLRunModelLimitExceededError(MLRunHTTPStatusError, ValueError):
+    error_status_code = HTTPStatus.BAD_REQUEST.value
+
+
 class MLRunInvalidArgumentTypeError(MLRunHTTPStatusError, TypeError):
     error_status_code = HTTPStatus.BAD_REQUEST.value
 
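`MLRunModelLimitExceededError` follows the same pattern as the neighboring classes: an MLRun HTTP error mapped to status 400. A small illustrative sketch; the guard and message below are hypothetical, only the exception class itself comes from this release:

from http import HTTPStatus

import mlrun.errors

def register_model(models_count: int, limit: int = 5) -> None:
    # hypothetical guard, shown only to illustrate raising the new error type
    if models_count >= limit:
        raise mlrun.errors.MLRunModelLimitExceededError(
            f"model limit of {limit} exceeded"
        )

try:
    register_model(models_count=7)
except mlrun.errors.MLRunModelLimitExceededError as exc:
    # the class attribute maps the error to HTTP 400 (BAD_REQUEST)
    assert exc.error_status_code == HTTPStatus.BAD_REQUEST.value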
mlrun/execution.py
CHANGED

@@ -876,7 +876,7 @@ class MLClientCtx:
 
     def log_document(
         self,
-        key: str,
+        key: str = "",
         tag: str = "",
         local_path: str = "",
         artifact_path: Optional[str] = None,
@@ -890,7 +890,8 @@
         """
         Log a document as an artifact.
 
-        :param key:
+        :param key: Optional artifact key. If not provided, will be derived from local_path
+            or target_path using DocumentArtifact.key_from_source()
         :param tag: Version tag
         :param local_path: path to the local file we upload, will also be use
             as the destination subpath (under "artifact_path")
@@ -923,7 +924,6 @@
         Example:
             >>> # Log a PDF document with custom loader
             >>> project.log_document(
-            ...     key="my_doc",
             ...     local_path="path/to/doc.pdf",
             ...     document_loader_spec=DocumentLoaderSpec(
             ...         loader_class_name="langchain_community.document_loaders.PDFLoader",
@@ -932,10 +932,19 @@
             ...     ),
             ... )
         """
+
+        if not key and not local_path and not target_path:
+            raise ValueError(
+                "Must provide either 'key' parameter or 'local_path'/'target_path' to derive the key from"
+            )
+        if not key:
+            key = DocumentArtifact.key_from_source(local_path or target_path)
+
         doc_artifact = DocumentArtifact(
             key=key,
             original_source=local_path or target_path,
             document_loader_spec=document_loader_spec,
+            collections=kwargs.pop("collections", None),
             **kwargs,
         )
 
@@ -964,12 +973,12 @@
     def get_artifact(
         self, key, tag=None, iter=None, tree=None, uid=None
     ) -> Optional[Artifact]:
-
+        cached_artifact_uri = self._artifacts_manager.artifact_uris.get(key, None)
+        if tag or iter or tree or uid or (not cached_artifact_uri):
             project = self.get_project_object()
             return project.get_artifact(key=key, tag=tag, iter=iter, tree=tree, uid=uid)
         else:
-
-            return self.get_store_resource(artifact_uri)
+            return self.get_store_resource(cached_artifact_uri)
 
     def update_artifact(self, artifact_object: Artifact):
         """Update an artifact object in the DB and the cached uri"""
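With this change, `log_document` no longer requires an explicit key: when only a path is given, the key is derived via `DocumentArtifact.key_from_source`, and calling it with neither a key nor a path now raises a `ValueError`. A minimal sketch of the new call inside a handler; the file path is a placeholder:

import mlrun

def handler(context: mlrun.MLClientCtx):
    # key is omitted, so it is derived from the local path via
    # DocumentArtifact.key_from_source("data/report.pdf")
    context.log_document(local_path="data/report.pdf")
    # calling context.log_document() with no key and no path now raises ValueError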
mlrun/launcher/client.py
CHANGED

@@ -134,7 +134,7 @@ class ClientBaseLauncher(launcher.BaseLauncher, abc.ABC):
         if mlrun.utils.is_jupyter and mlrun.mlconf.ipython_widget:
             results_tbl.show()
             print()
-            ui_url = mlrun.utils.
+            ui_url = mlrun.utils.get_run_url(project, uid=uid, name=run.metadata.name)
             if ui_url:
                 ui_url = f' or <a href="{ui_url}" target="_blank">click here</a> to open in UI'
             IPython.display.display(
@@ -150,6 +150,6 @@ class ClientBaseLauncher(launcher.BaseLauncher, abc.ABC):
             mlrun.utils.logger.info(
                 "To track results use the CLI", info_cmd=info_cmd, logs_cmd=logs_cmd
             )
-            ui_url = mlrun.utils.
+            ui_url = mlrun.utils.get_run_url(project, uid=uid, name=run.metadata.name)
             if ui_url:
                 mlrun.utils.logger.info("Or click for UI", ui_url=ui_url)
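Both branches now build the run link through `mlrun.utils.get_run_url` with the run name included. A small sketch with placeholder values; per the `if ui_url:` guard above, the result is presumably empty when no UI address is configured:

import mlrun

ui_url = mlrun.utils.get_run_url("my-project", uid="run-uid", name="trainer-train")
if ui_url:
    print(f"open the run in the UI: {ui_url}")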
mlrun/launcher/local.py
CHANGED

@@ -281,5 +281,9 @@ class ClientLocalLauncher(launcher.ClientBaseLauncher):
         # once the run is completed, and we can just push the notifications.
         # Only push from jupyter, not from the CLI.
         # "handler" and "dask" kinds are special cases of local runs which don't set local=True
-        if self._is_run_local or runtime.kind in ["handler"
+        if self._is_run_local or runtime.kind in ["handler"]:
             mlrun.utils.notifications.NotificationPusher([runobj]).push()
+        elif runtime.kind in ["dask"]:
+            runtime._get_db().push_run_notifications(
+                uid=runobj.metadata.uid, project=runobj.metadata.project
+            )
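For `dask` runs the launcher now asks the API server to push the run notifications instead of pushing them from the client. A minimal sketch of the equivalent client-side call, assuming a reachable API server; the run uid and project are placeholders:

import mlrun

db = mlrun.get_run_db()
# mirrors runtime._get_db().push_run_notifications(...) used by the launcher
db.push_run_notifications(uid="run-uid", project="my-project")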
mlrun/model_monitoring/applications/_application_steps.py
CHANGED

@@ -166,7 +166,9 @@ class _ApplicationErrorHandler(StepToDict):
                 "Endpoint ID": event.body.endpoint_id,
                 "Application Class": event.body.application_name,
                 "Error": "".join(
-                    traceback.format_exception(
+                    traceback.format_exception(
+                        None, value=event.error, tb=event.error.__traceback__
+                    )
                 ),
                 "Timestamp": event.timestamp,
             }
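The error handler now calls `traceback.format_exception` with an explicit `None` exception type plus `value`/`tb` keywords, a calling form that is accepted by both the pre-3.10 and the 3.10+ signatures of the function. A standalone sketch of the same call:

import traceback

try:
    1 / 0
except ZeroDivisionError as exc:
    # join the formatted traceback lines into a single string, as the handler does
    text = "".join(
        traceback.format_exception(None, value=exc, tb=exc.__traceback__)
    )
    print(text)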
mlrun/model_monitoring/controller.py
CHANGED

@@ -12,14 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import concurrent.futures
 import datetime
 import json
 import os
 from collections.abc import Iterator
 from contextlib import AbstractContextManager
 from types import TracebackType
-from typing import NamedTuple, Optional, cast
+from typing import Any, NamedTuple, Optional, cast
 
 import nuclio_sdk
 
@@ -28,6 +27,10 @@ import mlrun.common.schemas.model_monitoring.constants as mm_constants
 import mlrun.feature_store as fstore
 import mlrun.model_monitoring
 from mlrun.common.schemas import EndpointType
+from mlrun.common.schemas.model_monitoring.constants import (
+    ControllerEvent,
+    ControllerEventKind,
+)
 from mlrun.datastore import get_stream_pusher
 from mlrun.errors import err_to_str
 from mlrun.model_monitoring.db._schedules import ModelMonitoringSchedulesFile
@@ -140,6 +143,7 @@ class _BatchWindowGenerator(AbstractContextManager):
         Initialize a batch window generator object that generates batch window objects
         for the monitoring functions.
         """
+        self.batch_window: _BatchWindow = None
         self._project = project
         self._endpoint_id = endpoint_id
         self._timedelta = window_length
@@ -199,14 +203,14 @@
         `first_request` and `last_request` are the timestamps of the first request and last
         request to the endpoint, respectively. They are guaranteed to be nonempty at this point.
         """
-        batch_window = _BatchWindow(
+        self.batch_window = _BatchWindow(
             schedules_file=self._schedules_file,
             application=application,
             timedelta_seconds=self._timedelta,
             last_updated=self._get_last_updated_time(last_request, not_batch_endpoint),
             first_request=int(first_request.timestamp()),
         )
-        yield from batch_window.get_intervals()
+        yield from self.batch_window.get_intervals()
 
 
 def _get_window_length() -> int:
@@ -237,6 +241,7 @@ class MonitoringApplicationController:
         self._window_length = _get_window_length()
 
         self.model_monitoring_access_key = self._get_model_monitoring_access_key()
+        self.v3io_access_key = mlrun.get_secret_or_env("V3IO_ACCESS_KEY")
         self.storage_options = None
         if mlrun.mlconf.artifact_path.startswith("s3://"):
             self.storage_options = mlrun.mlconf.get_s3_storage_options()
@@ -262,112 +267,65 @@ class MonitoringApplicationController:
             != mm_constants.EndpointType.ROUTER.value
         )
 
-    def run(self) -> None:
+    def run(self, event: nuclio_sdk.Event) -> None:
         """
-        Main method for
+        Main method for controller chief, runs all the relevant monitoring applications for a single endpoint.
+        Handles nop events logic.
         This method handles the following:
-        1.
-        2.
-        3.
-        4.
-        5. Delete old parquets
+        1. Read applications from the event (endpoint_policy)
+        2. Check model monitoring windows
+        3. Send data to applications
+        4. Pushes nop event to main stream if needed
         """
-        logger.info("Start running monitoring controller")
+        logger.info("Start running monitoring controller worker")
         try:
-
-            endpoints_list = mlrun.db.get_run_db().list_model_endpoints(
-                project=self.project, tsdb_metrics=True
-            )
-            endpoints = endpoints_list.endpoints
-            if not endpoints:
-                logger.info("No model endpoints found", project=self.project)
-                return
-            monitoring_functions = self.project_obj.list_model_monitoring_functions()
-            if monitoring_functions:
-                applications_names = list(
-                    {app.metadata.name for app in monitoring_functions}
-                )
-            # if monitoring_functions: - TODO : ML-7700
-            # Gets only application in ready state
-            # applications_names = list(
-            #     {
-            #         app.metadata.name
-            #         for app in monitoring_functions
-            #         if (
-            #             app.status.state == "ready"
-            #             # workaround for the default app, as its `status.state` is `None`
-            #             or app.metadata.name
-            #             == mm_constants.HistogramDataDriftApplicationConstants.NAME
-            #         )
-            #     }
-            # )
-            if not applications_names:
-                logger.info("No monitoring functions found", project=self.project)
-                return
-            logger.info(
-                "Starting to iterate over the applications",
-                applications=applications_names,
-            )
-
+            body = json.loads(event.body.decode("utf-8"))
         except Exception as e:
             logger.error(
-                "Failed to
+                "Failed to decode event",
                 exc=err_to_str(e),
             )
             return
-        #
-
-            max_workers=min(len(endpoints), 10)
-        ) as pool:
-            for endpoint in endpoints:
-                if self._should_monitor_endpoint(endpoint):
-                    pool.submit(
-                        MonitoringApplicationController.model_endpoint_process,
-                        project=self.project,
-                        endpoint=endpoint,
-                        applications_names=applications_names,
-                        window_length=self._window_length,
-                        model_monitoring_access_key=self.model_monitoring_access_key,
-                        storage_options=self.storage_options,
-                    )
-                else:
-                    logger.debug(
-                        "Skipping endpoint, not ready or not suitable for monitoring",
-                        endpoint_id=endpoint.metadata.uid,
-                        endpoint_name=endpoint.metadata.name,
-                    )
-        logger.info("Finished running monitoring controller")
+        # Run single endpoint process
+        self.model_endpoint_process(event=body)
 
-    @classmethod
     def model_endpoint_process(
-
-
-        endpoint: mlrun.common.schemas.ModelEndpoint,
-        applications_names: list[str],
-        window_length: int,
-        model_monitoring_access_key: str,
-        storage_options: Optional[dict] = None,
+        self,
+        event: Optional[dict] = None,
     ) -> None:
         """
         Process a model endpoint and trigger the monitoring applications. This function running on different process
-        for each endpoint.
-
-
-        :param endpoint: (dict) Model endpoint record.
-        :param applications_names: (list[str]) List of application names to push results to.
-        :param batch_window_generator: (_BatchWindowGenerator) An object that generates _BatchWindow objects.
-        :param project: (str) Project name.
-        :param model_monitoring_access_key: (str) Access key to apply the model monitoring process.
-        :param storage_options: (dict) Storage options for reading the infer parquet files.
+        for each endpoint.
+
+        :param event: (dict) Event that triggered the monitoring process.
         """
-
-
-            endpoint.metadata.endpoint_type == EndpointType.BATCH_EP
-        )
-        m_fs = fstore.get_feature_set(endpoint.spec.monitoring_feature_set_uri)
+        logger.info("Model endpoint process started", event=event)
+
         try:
+            project_name = event[ControllerEvent.PROJECT]
+            endpoint_id = event[ControllerEvent.ENDPOINT_ID]
+            endpoint_name = event[ControllerEvent.ENDPOINT_NAME]
+            applications_names = event[ControllerEvent.ENDPOINT_POLICY][
+                "monitoring_applications"
+            ]
+
+            not_batch_endpoint = (
+                event[ControllerEvent.ENDPOINT_POLICY] != EndpointType.BATCH_EP
+            )
+            m_fs = fstore.get_feature_set(event[ControllerEvent.FEATURE_SET_URI])
+            logger.info(
+                "Starting analyzing for:", timestamp=event[ControllerEvent.TIMESTAMP]
+            )
+            last_stream_timestamp = datetime.datetime.fromisoformat(
+                event[ControllerEvent.TIMESTAMP]
+            )
+            first_request = datetime.datetime.fromisoformat(
+                event[ControllerEvent.FIRST_REQUEST]
+            )
             with _BatchWindowGenerator(
-                project=
+                project=project_name,
+                endpoint_id=endpoint_id,
+                window_length=self._window_length,
             ) as batch_window_generator:
                 for application in applications_names:
                     for (
@@ -375,15 +333,15 @@
                         end_infer_time,
                     ) in batch_window_generator.get_intervals(
                         application=application,
-                        first_request=endpoint.status.first_request,
-                        last_request=endpoint.status.last_request,
                         not_batch_endpoint=not_batch_endpoint,
+                        first_request=first_request,
+                        last_request=last_stream_timestamp,
                     ):
                         df = m_fs.to_dataframe(
                             start_time=start_infer_time,
                             end_time=end_infer_time,
                             time_column=mm_constants.EventFieldType.TIMESTAMP,
-                            storage_options=storage_options,
+                            storage_options=self.storage_options,
                         )
                         if len(df) == 0:
                             logger.info(
@@ -399,21 +357,53 @@
                                 end=end_infer_time,
                                 endpoint_id=endpoint_id,
                             )
-
+                        self._push_to_applications(
                             start_infer_time=start_infer_time,
                             end_infer_time=end_infer_time,
                             endpoint_id=endpoint_id,
-                            endpoint_name=
-                            project=
+                            endpoint_name=endpoint_name,
+                            project=project_name,
                             applications_names=[application],
-                            model_monitoring_access_key=model_monitoring_access_key,
+                            model_monitoring_access_key=self.model_monitoring_access_key,
                         )
-
+                base_period = event[ControllerEvent.ENDPOINT_POLICY]["base_period"]
+                current_time = mlrun.utils.datetime_now()
+                if (
+                    current_time.timestamp()
+                    - batch_window_generator.batch_window._get_last_analyzed()
+                    >= datetime.timedelta(minutes=base_period).total_seconds()
+                    and event[ControllerEvent.KIND] != ControllerEventKind.NOP_EVENT
+                ):
+                    event = {
+                        ControllerEvent.KIND: mm_constants.ControllerEventKind.NOP_EVENT,
+                        ControllerEvent.PROJECT: project_name,
+                        ControllerEvent.ENDPOINT_ID: endpoint_id,
+                        ControllerEvent.ENDPOINT_NAME: endpoint_name,
+                        ControllerEvent.TIMESTAMP: current_time.isoformat(
+                            timespec="microseconds"
+                        ),
+                        ControllerEvent.ENDPOINT_POLICY: event[
+                            ControllerEvent.ENDPOINT_POLICY
+                        ],
+                        ControllerEvent.ENDPOINT_TYPE: event[
+                            ControllerEvent.ENDPOINT_TYPE
+                        ],
+                        ControllerEvent.FEATURE_SET_URI: event[
+                            ControllerEvent.FEATURE_SET_URI
+                        ],
+                        ControllerEvent.FIRST_REQUEST: event[
+                            ControllerEvent.FIRST_REQUEST
+                        ],
+                    }
+                    self._push_to_main_stream(
+                        event=event,
+                        endpoint_id=endpoint_id,
+                    )
 
         except Exception:
             logger.exception(
                 "Encountered an exception",
-                endpoint_id=
+                endpoint_id=event[ControllerEvent.ENDPOINT_ID],
             )
 
     @staticmethod
@@ -465,6 +455,168 @@
             [data]
         )
 
+    def push_regular_event_to_controller_stream(self, event: nuclio_sdk.Event) -> None:
+        """
+        pushes a regular event to the controller stream.
+        :param event: the nuclio trigger event
+        """
+        logger.info("Starting monitoring controller chief")
+        applications_names = []
+        db = mlrun.get_run_db()
+        endpoints = db.list_model_endpoints(
+            project=self.project, tsdb_metrics=True
+        ).endpoints
+        if not endpoints:
+            logger.info("No model endpoints found", project=self.project)
+            return
+        monitoring_functions = self.project_obj.list_model_monitoring_functions()
+        if monitoring_functions:
+            # if monitoring_functions: - TODO : ML-7700
+            # Gets only application in ready state
+            # applications_names = list(
+            #     {
+            #         app.metadata.name
+            #         for app in monitoring_functions
+            #         if (
+            #             app.status.state == "ready"
+            #             # workaround for the default app, as its `status.state` is `None`
+            #             or app.metadata.name
+            #             == mm_constants.HistogramDataDriftApplicationConstants.NAME
+            #         )
+            #     }
+            # )
+            applications_names = list(
+                {app.metadata.name for app in monitoring_functions}
+            )
+        if not applications_names:
+            logger.info("No monitoring functions found", project=self.project)
+            return
+        policy = {
+            "monitoring_applications": applications_names,
+            "base_period": int(
+                batch_dict2timedelta(
+                    json.loads(
+                        cast(
+                            str,
+                            os.getenv(mm_constants.EventFieldType.BATCH_INTERVALS_DICT),
+                        )
+                    )
+                ).total_seconds()
+                // 60
+            ),
+        }
+        for endpoint in endpoints:
+            if self._should_monitor_endpoint(endpoint):
+                logger.info(
+                    "Regular event is being pushed to controller stream for model endpoint",
+                    endpoint_id=endpoint.metadata.uid,
+                    endpoint_name=endpoint.metadata.name,
+                    timestamp=endpoint.status.last_request.isoformat(
+                        sep=" ", timespec="microseconds"
+                    ),
+                    first_request=endpoint.status.first_request.isoformat(
+                        sep=" ", timespec="microseconds"
+                    ),
+                    endpoint_type=endpoint.metadata.endpoint_type,
+                    feature_set_uri=endpoint.spec.monitoring_feature_set_uri,
+                    endpoint_policy=json.dumps(policy),
+                )
+                self.push_to_controller_stream(
+                    kind=mm_constants.ControllerEventKind.REGULAR_EVENT,
+                    project=self.project,
+                    endpoint_id=endpoint.metadata.uid,
+                    endpoint_name=endpoint.metadata.name,
+                    stream_access_key=self.v3io_access_key,
+                    timestamp=endpoint.status.last_request.isoformat(
+                        sep=" ", timespec="microseconds"
+                    ),
+                    first_request=endpoint.status.first_request.isoformat(
+                        sep=" ", timespec="microseconds"
+                    ),
+                    endpoint_type=endpoint.metadata.endpoint_type,
+                    feature_set_uri=endpoint.spec.monitoring_feature_set_uri,
+                    endpoint_policy=policy,
+                )
+            else:
+                logger.info(
+                    "Should not monitor model endpoint, didn't push regular event",
+                    endpoint_id=endpoint.metadata.uid,
+                    endpoint_name=endpoint.metadata.name,
+                    timestamp=endpoint.status.last_request,
+                    first_request=endpoint.status.first_request,
+                    endpoint_type=endpoint.metadata.endpoint_type,
+                    feature_set_uri=endpoint.spec.monitoring_feature_set_uri,
+                )
+
+    @staticmethod
+    def push_to_controller_stream(
+        kind: str,
+        project: str,
+        endpoint_id: str,
+        endpoint_name: str,
+        stream_access_key: str,
+        timestamp: str,
+        first_request: str,
+        endpoint_type: str,
+        feature_set_uri: str,
+        endpoint_policy: dict[str, Any],
+    ) -> None:
+        """
+        Pushes event data to controller stream.
+        :param timestamp: the event timestamp str isoformat utc timezone
+        :param first_request: the first request str isoformat utc timezone
+        :param endpoint_policy: dictionary hold the monitoring policy
+        :param kind: str event kind
+        :param project: project name
+        :param endpoint_id: endpoint id string
+        :param endpoint_name: the endpoint name string
+        :param endpoint_type: Enum of the endpoint type
+        :param feature_set_uri: the feature set uri string
+        :param stream_access_key: access key to apply the model monitoring process.
+        """
+        stream_uri = get_stream_path(
+            project=project,
+            function_name=mm_constants.MonitoringFunctionNames.APPLICATION_CONTROLLER,
+        )
+        event = {
+            ControllerEvent.KIND.value: kind,
+            ControllerEvent.PROJECT.value: project,
+            ControllerEvent.ENDPOINT_ID.value: endpoint_id,
+            ControllerEvent.ENDPOINT_NAME.value: endpoint_name,
+            ControllerEvent.TIMESTAMP.value: timestamp,
+            ControllerEvent.FIRST_REQUEST.value: first_request,
+            ControllerEvent.ENDPOINT_TYPE.value: endpoint_type,
+            ControllerEvent.FEATURE_SET_URI.value: feature_set_uri,
+            ControllerEvent.ENDPOINT_POLICY.value: endpoint_policy,
+        }
+        logger.info(
+            "Pushing data to controller stream",
+            event=event,
+            endpoint_id=endpoint_id,
+            stream_uri=stream_uri,
+        )
+        get_stream_pusher(stream_uri, access_key=stream_access_key).push(
+            [event], partition_key=endpoint_id
+        )
+
+    def _push_to_main_stream(self, event: dict, endpoint_id: str) -> None:
+        """
+        Pushes the given event to model monitoring stream
+        :param event: event dictionary to push to stream
+        :param endpoint_id: endpoint id string
+        """
+        stream_uri = get_stream_path(project=event.get(ControllerEvent.PROJECT))
+
+        logger.info(
+            "Pushing data to main stream, NOP event is been generated",
+            event=json.dumps(event),
+            endpoint_id=endpoint_id,
+            stream_uri=stream_uri,
+        )
+        get_stream_pusher(stream_uri, access_key=self.model_monitoring_access_key).push(
+            [event], partition_key=endpoint_id
+        )
+
 
 def handler(context: nuclio_sdk.Context, event: nuclio_sdk.Event) -> None:
     """
@@ -473,4 +625,15 @@ def handler(context: nuclio_sdk.Context, event: nuclio_sdk.Event) -> None:
     :param context: the Nuclio context
     :param event: trigger event
     """
-
+    logger.info(
+        "Controller got event",
+        trigger=event.trigger,
+        trigger_kind=event.trigger.kind,
+    )
+
+    if event.trigger.kind == "http":
+        # Runs controller chief:
+        MonitoringApplicationController().push_regular_event_to_controller_stream(event)
+    else:
+        # Runs controller worker:
+        MonitoringApplicationController().run(event=event)