mlrun 1.8.0rc21__py3-none-any.whl → 1.8.0rc26__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mlrun might be problematic. Click here for more details.

Files changed (47)
  1. mlrun/__init__.py +37 -3
  2. mlrun/alerts/alert.py +1 -0
  3. mlrun/artifacts/document.py +78 -36
  4. mlrun/common/formatters/feature_set.py +1 -0
  5. mlrun/common/schemas/alert.py +3 -0
  6. mlrun/common/schemas/client_spec.py +0 -1
  7. mlrun/common/schemas/model_monitoring/constants.py +27 -9
  8. mlrun/common/schemas/workflow.py +1 -0
  9. mlrun/config.py +39 -6
  10. mlrun/datastore/datastore_profile.py +58 -16
  11. mlrun/datastore/sources.py +7 -1
  12. mlrun/datastore/vectorstore.py +20 -1
  13. mlrun/db/base.py +11 -0
  14. mlrun/db/httpdb.py +21 -9
  15. mlrun/db/nopdb.py +10 -0
  16. mlrun/errors.py +4 -0
  17. mlrun/execution.py +15 -6
  18. mlrun/launcher/client.py +2 -2
  19. mlrun/launcher/local.py +5 -1
  20. mlrun/model_monitoring/applications/_application_steps.py +3 -1
  21. mlrun/model_monitoring/controller.py +266 -103
  22. mlrun/model_monitoring/db/tsdb/__init__.py +11 -23
  23. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +5 -2
  24. mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +8 -8
  25. mlrun/model_monitoring/helpers.py +16 -10
  26. mlrun/model_monitoring/stream_processing.py +85 -35
  27. mlrun/package/context_handler.py +1 -1
  28. mlrun/package/packagers_manager.py +4 -18
  29. mlrun/projects/pipelines.py +2 -2
  30. mlrun/projects/project.py +123 -38
  31. mlrun/runtimes/nuclio/serving.py +2 -2
  32. mlrun/runtimes/sparkjob/spark3job.py +1 -1
  33. mlrun/secrets.py +1 -1
  34. mlrun/serving/server.py +11 -3
  35. mlrun/serving/states.py +65 -8
  36. mlrun/serving/v2_serving.py +16 -8
  37. mlrun/utils/helpers.py +81 -21
  38. mlrun/utils/notifications/notification/base.py +6 -1
  39. mlrun/utils/notifications/notification/slack.py +5 -1
  40. mlrun/utils/notifications/notification_pusher.py +13 -4
  41. mlrun/utils/version/version.json +2 -2
  42. {mlrun-1.8.0rc21.dist-info → mlrun-1.8.0rc26.dist-info}/METADATA +33 -16
  43. {mlrun-1.8.0rc21.dist-info → mlrun-1.8.0rc26.dist-info}/RECORD +47 -47
  44. {mlrun-1.8.0rc21.dist-info → mlrun-1.8.0rc26.dist-info}/WHEEL +1 -1
  45. {mlrun-1.8.0rc21.dist-info → mlrun-1.8.0rc26.dist-info}/LICENSE +0 -0
  46. {mlrun-1.8.0rc21.dist-info → mlrun-1.8.0rc26.dist-info}/entry_points.txt +0 -0
  47. {mlrun-1.8.0rc21.dist-info → mlrun-1.8.0rc26.dist-info}/top_level.txt +0 -0
mlrun/db/httpdb.py CHANGED
@@ -559,14 +559,6 @@ class HTTPRunDB(RunDBInterface):
559
559
  server_cfg.get("external_platform_tracking")
560
560
  or config.external_platform_tracking
561
561
  )
562
- config.model_endpoint_monitoring.tsdb_connection = (
563
- server_cfg.get("model_monitoring_tsdb_connection")
564
- or config.model_endpoint_monitoring.tsdb_connection
565
- )
566
- config.model_endpoint_monitoring.stream_connection = (
567
- server_cfg.get("stream_connection")
568
- or config.model_endpoint_monitoring.stream_connection
569
- )
570
562
  config.packagers = server_cfg.get("packagers") or config.packagers
571
563
  server_data_prefixes = server_cfg.get("feature_store_data_prefixes") or {}
572
564
  for prefix in ["default", "nosql", "redisnosql"]:
@@ -771,7 +763,6 @@ class HTTPRunDB(RunDBInterface):
771
763
  :returns: :py:class:`~mlrun.common.schemas.BackgroundTask`.
772
764
  """
773
765
  project = project or config.default_project
774
-
775
766
  response = self.api_call(
776
767
  "POST",
777
768
  path=f"projects/{project}/runs/{uid}/push-notifications",
@@ -5030,6 +5021,27 @@ class HTTPRunDB(RunDBInterface):
5030
5021
  **kwargs,
5031
5022
  )
5032
5023
 
5024
+ def get_alert_activation(
5025
+ self,
5026
+ project,
5027
+ activation_id,
5028
+ ) -> mlrun.common.schemas.AlertActivation:
5029
+ """
5030
+ Retrieve the alert activation by id
5031
+
5032
+ :param project: Project name for which the summary belongs.
5033
+ :param activation_id: alert activation id.
5034
+ :returns: alert activation object.
5035
+ """
5036
+ project = project or config.default_project
5037
+
5038
+ error = "get alert activation"
5039
+ path = f"projects/{project}/alert-activations/{activation_id}"
5040
+
5041
+ response = self.api_call("GET", path, error)
5042
+
5043
+ return mlrun.common.schemas.AlertActivation(**response.json())
5044
+
5033
5045
  def get_project_summary(
5034
5046
  self, project: Optional[str] = None
5035
5047
  ) -> mlrun.common.schemas.ProjectSummary:
mlrun/db/nopdb.py CHANGED
@@ -84,6 +84,9 @@ class NopDB(RunDBInterface):
84
84
  ):
85
85
  pass
86
86
 
87
+ def refresh_smtp_configuration(self):
88
+ pass
89
+
87
90
  def push_pipeline_notifications(
88
91
  self,
89
92
  pipeline_id,
@@ -945,5 +948,12 @@ class NopDB(RunDBInterface):
945
948
  ):
946
949
  pass
947
950
 
951
+ def get_alert_activation(
952
+ self,
953
+ project,
954
+ activation_id,
955
+ ) -> mlrun.common.schemas.AlertActivation:
956
+ pass
957
+
948
958
  def get_project_summary(self, project: str):
949
959
  pass
mlrun/errors.py CHANGED
@@ -174,6 +174,10 @@ class MLRunInvalidArgumentError(MLRunHTTPStatusError, ValueError):
174
174
  error_status_code = HTTPStatus.BAD_REQUEST.value
175
175
 
176
176
 
177
+ class MLRunModelLimitExceededError(MLRunHTTPStatusError, ValueError):
178
+ error_status_code = HTTPStatus.BAD_REQUEST.value
179
+
180
+
177
181
  class MLRunInvalidArgumentTypeError(MLRunHTTPStatusError, TypeError):
178
182
  error_status_code = HTTPStatus.BAD_REQUEST.value
179
183
 
mlrun/execution.py CHANGED
@@ -876,7 +876,7 @@ class MLClientCtx:
876
876
 
877
877
  def log_document(
878
878
  self,
879
- key: str,
879
+ key: str = "",
880
880
  tag: str = "",
881
881
  local_path: str = "",
882
882
  artifact_path: Optional[str] = None,
@@ -890,7 +890,8 @@ class MLClientCtx:
890
890
  """
891
891
  Log a document as an artifact.
892
892
 
893
- :param key: Artifact key
893
+ :param key: Optional artifact key. If not provided, will be derived from local_path
894
+ or target_path using DocumentArtifact.key_from_source()
894
895
  :param tag: Version tag
895
896
  :param local_path: path to the local file we upload, will also be use
896
897
  as the destination subpath (under "artifact_path")
@@ -923,7 +924,6 @@ class MLClientCtx:
923
924
  Example:
924
925
  >>> # Log a PDF document with custom loader
925
926
  >>> project.log_document(
926
- ... key="my_doc",
927
927
  ... local_path="path/to/doc.pdf",
928
928
  ... document_loader_spec=DocumentLoaderSpec(
929
929
  ... loader_class_name="langchain_community.document_loaders.PDFLoader",
@@ -932,10 +932,19 @@ class MLClientCtx:
932
932
  ... ),
933
933
  ... )
934
934
  """
935
+
936
+ if not key and not local_path and not target_path:
937
+ raise ValueError(
938
+ "Must provide either 'key' parameter or 'local_path'/'target_path' to derive the key from"
939
+ )
940
+ if not key:
941
+ key = DocumentArtifact.key_from_source(local_path or target_path)
942
+
935
943
  doc_artifact = DocumentArtifact(
936
944
  key=key,
937
945
  original_source=local_path or target_path,
938
946
  document_loader_spec=document_loader_spec,
947
+ collections=kwargs.pop("collections", None),
939
948
  **kwargs,
940
949
  )
941
950
 
@@ -964,12 +973,12 @@ class MLClientCtx:
964
973
  def get_artifact(
965
974
  self, key, tag=None, iter=None, tree=None, uid=None
966
975
  ) -> Optional[Artifact]:
967
- if tag or iter or tree or uid:
976
+ cached_artifact_uri = self._artifacts_manager.artifact_uris.get(key, None)
977
+ if tag or iter or tree or uid or (not cached_artifact_uri):
968
978
  project = self.get_project_object()
969
979
  return project.get_artifact(key=key, tag=tag, iter=iter, tree=tree, uid=uid)
970
980
  else:
971
- artifact_uri = self._artifacts_manager.artifact_uris[key]
972
- return self.get_store_resource(artifact_uri)
981
+ return self.get_store_resource(cached_artifact_uri)
973
982
 
974
983
  def update_artifact(self, artifact_object: Artifact):
975
984
  """Update an artifact object in the DB and the cached uri"""
mlrun/launcher/client.py CHANGED
@@ -134,7 +134,7 @@ class ClientBaseLauncher(launcher.BaseLauncher, abc.ABC):
134
134
  if mlrun.utils.is_jupyter and mlrun.mlconf.ipython_widget:
135
135
  results_tbl.show()
136
136
  print()
137
- ui_url = mlrun.utils.get_ui_url(project, uid)
137
+ ui_url = mlrun.utils.get_run_url(project, uid=uid, name=run.metadata.name)
138
138
  if ui_url:
139
139
  ui_url = f' or <a href="{ui_url}" target="_blank">click here</a> to open in UI'
140
140
  IPython.display.display(
@@ -150,6 +150,6 @@ class ClientBaseLauncher(launcher.BaseLauncher, abc.ABC):
150
150
  mlrun.utils.logger.info(
151
151
  "To track results use the CLI", info_cmd=info_cmd, logs_cmd=logs_cmd
152
152
  )
153
- ui_url = mlrun.utils.get_ui_url(project, uid)
153
+ ui_url = mlrun.utils.get_run_url(project, uid=uid, name=run.metadata.name)
154
154
  if ui_url:
155
155
  mlrun.utils.logger.info("Or click for UI", ui_url=ui_url)
mlrun/launcher/local.py CHANGED
@@ -281,5 +281,9 @@ class ClientLocalLauncher(launcher.ClientBaseLauncher):
281
281
  # once the run is completed, and we can just push the notifications.
282
282
  # Only push from jupyter, not from the CLI.
283
283
  # "handler" and "dask" kinds are special cases of local runs which don't set local=True
284
- if self._is_run_local or runtime.kind in ["handler", "dask"]:
284
+ if self._is_run_local or runtime.kind in ["handler"]:
285
285
  mlrun.utils.notifications.NotificationPusher([runobj]).push()
286
+ elif runtime.kind in ["dask"]:
287
+ runtime._get_db().push_run_notifications(
288
+ uid=runobj.metadata.uid, project=runobj.metadata.project
289
+ )
mlrun/model_monitoring/applications/_application_steps.py CHANGED
@@ -166,7 +166,9 @@ class _ApplicationErrorHandler(StepToDict):
166
166
  "Endpoint ID": event.body.endpoint_id,
167
167
  "Application Class": event.body.application_name,
168
168
  "Error": "".join(
169
- traceback.format_exception(None, event.error, event.error.__traceback__)
169
+ traceback.format_exception(
170
+ None, value=event.error, tb=event.error.__traceback__
171
+ )
170
172
  ),
171
173
  "Timestamp": event.timestamp,
172
174
  }
mlrun/model_monitoring/controller.py CHANGED
@@ -12,14 +12,13 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
- import concurrent.futures
16
15
  import datetime
17
16
  import json
18
17
  import os
19
18
  from collections.abc import Iterator
20
19
  from contextlib import AbstractContextManager
21
20
  from types import TracebackType
22
- from typing import NamedTuple, Optional, cast
21
+ from typing import Any, NamedTuple, Optional, cast
23
22
 
24
23
  import nuclio_sdk
25
24
 
@@ -28,6 +27,10 @@ import mlrun.common.schemas.model_monitoring.constants as mm_constants
28
27
  import mlrun.feature_store as fstore
29
28
  import mlrun.model_monitoring
30
29
  from mlrun.common.schemas import EndpointType
30
+ from mlrun.common.schemas.model_monitoring.constants import (
31
+ ControllerEvent,
32
+ ControllerEventKind,
33
+ )
31
34
  from mlrun.datastore import get_stream_pusher
32
35
  from mlrun.errors import err_to_str
33
36
  from mlrun.model_monitoring.db._schedules import ModelMonitoringSchedulesFile
@@ -140,6 +143,7 @@ class _BatchWindowGenerator(AbstractContextManager):
140
143
  Initialize a batch window generator object that generates batch window objects
141
144
  for the monitoring functions.
142
145
  """
146
+ self.batch_window: _BatchWindow = None
143
147
  self._project = project
144
148
  self._endpoint_id = endpoint_id
145
149
  self._timedelta = window_length
@@ -199,14 +203,14 @@ class _BatchWindowGenerator(AbstractContextManager):
199
203
  `first_request` and `last_request` are the timestamps of the first request and last
200
204
  request to the endpoint, respectively. They are guaranteed to be nonempty at this point.
201
205
  """
202
- batch_window = _BatchWindow(
206
+ self.batch_window = _BatchWindow(
203
207
  schedules_file=self._schedules_file,
204
208
  application=application,
205
209
  timedelta_seconds=self._timedelta,
206
210
  last_updated=self._get_last_updated_time(last_request, not_batch_endpoint),
207
211
  first_request=int(first_request.timestamp()),
208
212
  )
209
- yield from batch_window.get_intervals()
213
+ yield from self.batch_window.get_intervals()
210
214
 
211
215
 
212
216
  def _get_window_length() -> int:
@@ -237,6 +241,7 @@ class MonitoringApplicationController:
237
241
  self._window_length = _get_window_length()
238
242
 
239
243
  self.model_monitoring_access_key = self._get_model_monitoring_access_key()
244
+ self.v3io_access_key = mlrun.get_secret_or_env("V3IO_ACCESS_KEY")
240
245
  self.storage_options = None
241
246
  if mlrun.mlconf.artifact_path.startswith("s3://"):
242
247
  self.storage_options = mlrun.mlconf.get_s3_storage_options()
@@ -262,112 +267,65 @@ class MonitoringApplicationController:
262
267
  != mm_constants.EndpointType.ROUTER.value
263
268
  )
264
269
 
265
- def run(self) -> None:
270
+ def run(self, event: nuclio_sdk.Event) -> None:
266
271
  """
267
- Main method for run all the relevant monitoring applications on each endpoint.
272
+ Main method for controller chief, runs all the relevant monitoring applications for a single endpoint.
273
+ Handles nop events logic.
268
274
  This method handles the following:
269
- 1. List model endpoints
270
- 2. List applications
271
- 3. Check model monitoring windows
272
- 4. Send data to applications
273
- 5. Delete old parquets
275
+ 1. Read applications from the event (endpoint_policy)
276
+ 2. Check model monitoring windows
277
+ 3. Send data to applications
278
+ 4. Pushes nop event to main stream if needed
274
279
  """
275
- logger.info("Start running monitoring controller")
280
+ logger.info("Start running monitoring controller worker")
276
281
  try:
277
- applications_names = []
278
- endpoints_list = mlrun.db.get_run_db().list_model_endpoints(
279
- project=self.project, tsdb_metrics=True
280
- )
281
- endpoints = endpoints_list.endpoints
282
- if not endpoints:
283
- logger.info("No model endpoints found", project=self.project)
284
- return
285
- monitoring_functions = self.project_obj.list_model_monitoring_functions()
286
- if monitoring_functions:
287
- applications_names = list(
288
- {app.metadata.name for app in monitoring_functions}
289
- )
290
- # if monitoring_functions: - TODO : ML-7700
291
- # Gets only application in ready state
292
- # applications_names = list(
293
- # {
294
- # app.metadata.name
295
- # for app in monitoring_functions
296
- # if (
297
- # app.status.state == "ready"
298
- # # workaround for the default app, as its `status.state` is `None`
299
- # or app.metadata.name
300
- # == mm_constants.HistogramDataDriftApplicationConstants.NAME
301
- # )
302
- # }
303
- # )
304
- if not applications_names:
305
- logger.info("No monitoring functions found", project=self.project)
306
- return
307
- logger.info(
308
- "Starting to iterate over the applications",
309
- applications=applications_names,
310
- )
311
-
282
+ body = json.loads(event.body.decode("utf-8"))
312
283
  except Exception as e:
313
284
  logger.error(
314
- "Failed to list endpoints and monitoring applications",
285
+ "Failed to decode event",
315
286
  exc=err_to_str(e),
316
287
  )
317
288
  return
318
- # Initialize a thread pool that will be used to monitor each endpoint on a dedicated thread
319
- with concurrent.futures.ThreadPoolExecutor(
320
- max_workers=min(len(endpoints), 10)
321
- ) as pool:
322
- for endpoint in endpoints:
323
- if self._should_monitor_endpoint(endpoint):
324
- pool.submit(
325
- MonitoringApplicationController.model_endpoint_process,
326
- project=self.project,
327
- endpoint=endpoint,
328
- applications_names=applications_names,
329
- window_length=self._window_length,
330
- model_monitoring_access_key=self.model_monitoring_access_key,
331
- storage_options=self.storage_options,
332
- )
333
- else:
334
- logger.debug(
335
- "Skipping endpoint, not ready or not suitable for monitoring",
336
- endpoint_id=endpoint.metadata.uid,
337
- endpoint_name=endpoint.metadata.name,
338
- )
339
- logger.info("Finished running monitoring controller")
289
+ # Run single endpoint process
290
+ self.model_endpoint_process(event=body)
340
291
 
341
- @classmethod
342
292
  def model_endpoint_process(
343
- cls,
344
- project: str,
345
- endpoint: mlrun.common.schemas.ModelEndpoint,
346
- applications_names: list[str],
347
- window_length: int,
348
- model_monitoring_access_key: str,
349
- storage_options: Optional[dict] = None,
293
+ self,
294
+ event: Optional[dict] = None,
350
295
  ) -> None:
351
296
  """
352
297
  Process a model endpoint and trigger the monitoring applications. This function running on different process
353
- for each endpoint. In addition, this function will generate a parquet file that includes the relevant data
354
- for a specific time range.
355
-
356
- :param endpoint: (dict) Model endpoint record.
357
- :param applications_names: (list[str]) List of application names to push results to.
358
- :param batch_window_generator: (_BatchWindowGenerator) An object that generates _BatchWindow objects.
359
- :param project: (str) Project name.
360
- :param model_monitoring_access_key: (str) Access key to apply the model monitoring process.
361
- :param storage_options: (dict) Storage options for reading the infer parquet files.
298
+ for each endpoint.
299
+
300
+ :param event: (dict) Event that triggered the monitoring process.
362
301
  """
363
- endpoint_id = endpoint.metadata.uid
364
- not_batch_endpoint = not (
365
- endpoint.metadata.endpoint_type == EndpointType.BATCH_EP
366
- )
367
- m_fs = fstore.get_feature_set(endpoint.spec.monitoring_feature_set_uri)
302
+ logger.info("Model endpoint process started", event=event)
303
+
368
304
  try:
305
+ project_name = event[ControllerEvent.PROJECT]
306
+ endpoint_id = event[ControllerEvent.ENDPOINT_ID]
307
+ endpoint_name = event[ControllerEvent.ENDPOINT_NAME]
308
+ applications_names = event[ControllerEvent.ENDPOINT_POLICY][
309
+ "monitoring_applications"
310
+ ]
311
+
312
+ not_batch_endpoint = (
313
+ event[ControllerEvent.ENDPOINT_POLICY] != EndpointType.BATCH_EP
314
+ )
315
+ m_fs = fstore.get_feature_set(event[ControllerEvent.FEATURE_SET_URI])
316
+ logger.info(
317
+ "Starting analyzing for:", timestamp=event[ControllerEvent.TIMESTAMP]
318
+ )
319
+ last_stream_timestamp = datetime.datetime.fromisoformat(
320
+ event[ControllerEvent.TIMESTAMP]
321
+ )
322
+ first_request = datetime.datetime.fromisoformat(
323
+ event[ControllerEvent.FIRST_REQUEST]
324
+ )
369
325
  with _BatchWindowGenerator(
370
- project=project, endpoint_id=endpoint_id, window_length=window_length
326
+ project=project_name,
327
+ endpoint_id=endpoint_id,
328
+ window_length=self._window_length,
371
329
  ) as batch_window_generator:
372
330
  for application in applications_names:
373
331
  for (
@@ -375,15 +333,15 @@ class MonitoringApplicationController:
375
333
  end_infer_time,
376
334
  ) in batch_window_generator.get_intervals(
377
335
  application=application,
378
- first_request=endpoint.status.first_request,
379
- last_request=endpoint.status.last_request,
380
336
  not_batch_endpoint=not_batch_endpoint,
337
+ first_request=first_request,
338
+ last_request=last_stream_timestamp,
381
339
  ):
382
340
  df = m_fs.to_dataframe(
383
341
  start_time=start_infer_time,
384
342
  end_time=end_infer_time,
385
343
  time_column=mm_constants.EventFieldType.TIMESTAMP,
386
- storage_options=storage_options,
344
+ storage_options=self.storage_options,
387
345
  )
388
346
  if len(df) == 0:
389
347
  logger.info(
@@ -399,21 +357,53 @@ class MonitoringApplicationController:
399
357
  end=end_infer_time,
400
358
  endpoint_id=endpoint_id,
401
359
  )
402
- cls._push_to_applications(
360
+ self._push_to_applications(
403
361
  start_infer_time=start_infer_time,
404
362
  end_infer_time=end_infer_time,
405
363
  endpoint_id=endpoint_id,
406
- endpoint_name=endpoint.metadata.name,
407
- project=project,
364
+ endpoint_name=endpoint_name,
365
+ project=project_name,
408
366
  applications_names=[application],
409
- model_monitoring_access_key=model_monitoring_access_key,
367
+ model_monitoring_access_key=self.model_monitoring_access_key,
410
368
  )
411
- logger.info("Finished processing endpoint", endpoint_id=endpoint_id)
369
+ base_period = event[ControllerEvent.ENDPOINT_POLICY]["base_period"]
370
+ current_time = mlrun.utils.datetime_now()
371
+ if (
372
+ current_time.timestamp()
373
+ - batch_window_generator.batch_window._get_last_analyzed()
374
+ >= datetime.timedelta(minutes=base_period).total_seconds()
375
+ and event[ControllerEvent.KIND] != ControllerEventKind.NOP_EVENT
376
+ ):
377
+ event = {
378
+ ControllerEvent.KIND: mm_constants.ControllerEventKind.NOP_EVENT,
379
+ ControllerEvent.PROJECT: project_name,
380
+ ControllerEvent.ENDPOINT_ID: endpoint_id,
381
+ ControllerEvent.ENDPOINT_NAME: endpoint_name,
382
+ ControllerEvent.TIMESTAMP: current_time.isoformat(
383
+ timespec="microseconds"
384
+ ),
385
+ ControllerEvent.ENDPOINT_POLICY: event[
386
+ ControllerEvent.ENDPOINT_POLICY
387
+ ],
388
+ ControllerEvent.ENDPOINT_TYPE: event[
389
+ ControllerEvent.ENDPOINT_TYPE
390
+ ],
391
+ ControllerEvent.FEATURE_SET_URI: event[
392
+ ControllerEvent.FEATURE_SET_URI
393
+ ],
394
+ ControllerEvent.FIRST_REQUEST: event[
395
+ ControllerEvent.FIRST_REQUEST
396
+ ],
397
+ }
398
+ self._push_to_main_stream(
399
+ event=event,
400
+ endpoint_id=endpoint_id,
401
+ )
412
402
 
413
403
  except Exception:
414
404
  logger.exception(
415
405
  "Encountered an exception",
416
- endpoint_id=endpoint.metadata.uid,
406
+ endpoint_id=event[ControllerEvent.ENDPOINT_ID],
417
407
  )
418
408
 
419
409
  @staticmethod
@@ -465,6 +455,168 @@ class MonitoringApplicationController:
465
455
  [data]
466
456
  )
467
457
 
458
+ def push_regular_event_to_controller_stream(self, event: nuclio_sdk.Event) -> None:
459
+ """
460
+ pushes a regular event to the controller stream.
461
+ :param event: the nuclio trigger event
462
+ """
463
+ logger.info("Starting monitoring controller chief")
464
+ applications_names = []
465
+ db = mlrun.get_run_db()
466
+ endpoints = db.list_model_endpoints(
467
+ project=self.project, tsdb_metrics=True
468
+ ).endpoints
469
+ if not endpoints:
470
+ logger.info("No model endpoints found", project=self.project)
471
+ return
472
+ monitoring_functions = self.project_obj.list_model_monitoring_functions()
473
+ if monitoring_functions:
474
+ # if monitoring_functions: - TODO : ML-7700
475
+ # Gets only application in ready state
476
+ # applications_names = list(
477
+ # {
478
+ # app.metadata.name
479
+ # for app in monitoring_functions
480
+ # if (
481
+ # app.status.state == "ready"
482
+ # # workaround for the default app, as its `status.state` is `None`
483
+ # or app.metadata.name
484
+ # == mm_constants.HistogramDataDriftApplicationConstants.NAME
485
+ # )
486
+ # }
487
+ # )
488
+ applications_names = list(
489
+ {app.metadata.name for app in monitoring_functions}
490
+ )
491
+ if not applications_names:
492
+ logger.info("No monitoring functions found", project=self.project)
493
+ return
494
+ policy = {
495
+ "monitoring_applications": applications_names,
496
+ "base_period": int(
497
+ batch_dict2timedelta(
498
+ json.loads(
499
+ cast(
500
+ str,
501
+ os.getenv(mm_constants.EventFieldType.BATCH_INTERVALS_DICT),
502
+ )
503
+ )
504
+ ).total_seconds()
505
+ // 60
506
+ ),
507
+ }
508
+ for endpoint in endpoints:
509
+ if self._should_monitor_endpoint(endpoint):
510
+ logger.info(
511
+ "Regular event is being pushed to controller stream for model endpoint",
512
+ endpoint_id=endpoint.metadata.uid,
513
+ endpoint_name=endpoint.metadata.name,
514
+ timestamp=endpoint.status.last_request.isoformat(
515
+ sep=" ", timespec="microseconds"
516
+ ),
517
+ first_request=endpoint.status.first_request.isoformat(
518
+ sep=" ", timespec="microseconds"
519
+ ),
520
+ endpoint_type=endpoint.metadata.endpoint_type,
521
+ feature_set_uri=endpoint.spec.monitoring_feature_set_uri,
522
+ endpoint_policy=json.dumps(policy),
523
+ )
524
+ self.push_to_controller_stream(
525
+ kind=mm_constants.ControllerEventKind.REGULAR_EVENT,
526
+ project=self.project,
527
+ endpoint_id=endpoint.metadata.uid,
528
+ endpoint_name=endpoint.metadata.name,
529
+ stream_access_key=self.v3io_access_key,
530
+ timestamp=endpoint.status.last_request.isoformat(
531
+ sep=" ", timespec="microseconds"
532
+ ),
533
+ first_request=endpoint.status.first_request.isoformat(
534
+ sep=" ", timespec="microseconds"
535
+ ),
536
+ endpoint_type=endpoint.metadata.endpoint_type,
537
+ feature_set_uri=endpoint.spec.monitoring_feature_set_uri,
538
+ endpoint_policy=policy,
539
+ )
540
+ else:
541
+ logger.info(
542
+ "Should not monitor model endpoint, didn't push regular event",
543
+ endpoint_id=endpoint.metadata.uid,
544
+ endpoint_name=endpoint.metadata.name,
545
+ timestamp=endpoint.status.last_request,
546
+ first_request=endpoint.status.first_request,
547
+ endpoint_type=endpoint.metadata.endpoint_type,
548
+ feature_set_uri=endpoint.spec.monitoring_feature_set_uri,
549
+ )
550
+
551
+ @staticmethod
552
+ def push_to_controller_stream(
553
+ kind: str,
554
+ project: str,
555
+ endpoint_id: str,
556
+ endpoint_name: str,
557
+ stream_access_key: str,
558
+ timestamp: str,
559
+ first_request: str,
560
+ endpoint_type: str,
561
+ feature_set_uri: str,
562
+ endpoint_policy: dict[str, Any],
563
+ ) -> None:
564
+ """
565
+ Pushes event data to controller stream.
566
+ :param timestamp: the event timestamp str isoformat utc timezone
567
+ :param first_request: the first request str isoformat utc timezone
568
+ :param endpoint_policy: dictionary hold the monitoring policy
569
+ :param kind: str event kind
570
+ :param project: project name
571
+ :param endpoint_id: endpoint id string
572
+ :param endpoint_name: the endpoint name string
573
+ :param endpoint_type: Enum of the endpoint type
574
+ :param feature_set_uri: the feature set uri string
575
+ :param stream_access_key: access key to apply the model monitoring process.
576
+ """
577
+ stream_uri = get_stream_path(
578
+ project=project,
579
+ function_name=mm_constants.MonitoringFunctionNames.APPLICATION_CONTROLLER,
580
+ )
581
+ event = {
582
+ ControllerEvent.KIND.value: kind,
583
+ ControllerEvent.PROJECT.value: project,
584
+ ControllerEvent.ENDPOINT_ID.value: endpoint_id,
585
+ ControllerEvent.ENDPOINT_NAME.value: endpoint_name,
586
+ ControllerEvent.TIMESTAMP.value: timestamp,
587
+ ControllerEvent.FIRST_REQUEST.value: first_request,
588
+ ControllerEvent.ENDPOINT_TYPE.value: endpoint_type,
589
+ ControllerEvent.FEATURE_SET_URI.value: feature_set_uri,
590
+ ControllerEvent.ENDPOINT_POLICY.value: endpoint_policy,
591
+ }
592
+ logger.info(
593
+ "Pushing data to controller stream",
594
+ event=event,
595
+ endpoint_id=endpoint_id,
596
+ stream_uri=stream_uri,
597
+ )
598
+ get_stream_pusher(stream_uri, access_key=stream_access_key).push(
599
+ [event], partition_key=endpoint_id
600
+ )
601
+
602
+ def _push_to_main_stream(self, event: dict, endpoint_id: str) -> None:
603
+ """
604
+ Pushes the given event to model monitoring stream
605
+ :param event: event dictionary to push to stream
606
+ :param endpoint_id: endpoint id string
607
+ """
608
+ stream_uri = get_stream_path(project=event.get(ControllerEvent.PROJECT))
609
+
610
+ logger.info(
611
+ "Pushing data to main stream, NOP event is been generated",
612
+ event=json.dumps(event),
613
+ endpoint_id=endpoint_id,
614
+ stream_uri=stream_uri,
615
+ )
616
+ get_stream_pusher(stream_uri, access_key=self.model_monitoring_access_key).push(
617
+ [event], partition_key=endpoint_id
618
+ )
619
+
468
620
 
469
621
  def handler(context: nuclio_sdk.Context, event: nuclio_sdk.Event) -> None:
470
622
  """
@@ -473,4 +625,15 @@ def handler(context: nuclio_sdk.Context, event: nuclio_sdk.Event) -> None:
473
625
  :param context: the Nuclio context
474
626
  :param event: trigger event
475
627
  """
476
- MonitoringApplicationController().run()
628
+ logger.info(
629
+ "Controller got event",
630
+ trigger=event.trigger,
631
+ trigger_kind=event.trigger.kind,
632
+ )
633
+
634
+ if event.trigger.kind == "http":
635
+ # Runs controller chief:
636
+ MonitoringApplicationController().push_regular_event_to_controller_stream(event)
637
+ else:
638
+ # Runs controller worker:
639
+ MonitoringApplicationController().run(event=event)